From b333b06772c89d96aacb5490d6a219fba7c09cc6 Mon Sep 17 00:00:00 2001
From: Mitja Felicijan
Date: Thu, 12 Feb 2026 20:57:17 +0100
Subject: Engage!

---
 llama.cpp/tests/.gitignore                         |    6 +
 llama.cpp/tests/CMakeLists.txt                     |  266 +
 llama.cpp/tests/get-model.cpp                      |   21 +
 llama.cpp/tests/get-model.h                        |    2 +
 llama.cpp/tests/peg-parser/simple-tokenize.cpp     |   37 +
 llama.cpp/tests/peg-parser/simple-tokenize.h       |    6 +
 llama.cpp/tests/peg-parser/test-basic.cpp          |  454 +
 .../tests/peg-parser/test-gbnf-generation.cpp      |  250 +
 llama.cpp/tests/peg-parser/test-json-parser.cpp    |  109 +
 .../tests/peg-parser/test-json-serialization.cpp   |   28 +
 llama.cpp/tests/peg-parser/test-unicode.cpp        |  449 +
 llama.cpp/tests/peg-parser/tests.h                 |   24 +
 llama.cpp/tests/run-json-schema-to-grammar.mjs     |   10 +
 llama.cpp/tests/test-alloc.cpp                     |  608 ++
 llama.cpp/tests/test-arg-parser.cpp                |  208 +
 llama.cpp/tests/test-autorelease.cpp               |   24 +
 llama.cpp/tests/test-backend-ops.cpp               | 8943 ++++++++++++++++++++++
 llama.cpp/tests/test-backend-sampler.cpp           | 1165 +++
 llama.cpp/tests/test-barrier.cpp                   |  236 +
 llama.cpp/tests/test-c.c                           |    3 +
 llama.cpp/tests/test-chat-parser.cpp               |  617 ++
 llama.cpp/tests/test-chat-peg-parser.cpp           |  768 ++
 llama.cpp/tests/test-chat-template.cpp             |  680 ++
 llama.cpp/tests/test-chat.cpp                      | 4063 +++
 llama.cpp/tests/test-double-float.cpp              |   57 +
 llama.cpp/tests/test-gbnf-validator.cpp            |  109 +
 llama.cpp/tests/test-gguf.cpp                      | 1355 +++
 llama.cpp/tests/test-grammar-integration.cpp       | 1439 ++++
 llama.cpp/tests/test-grammar-llguidance.cpp        | 1204 +++
 llama.cpp/tests/test-grammar-parser.cpp            |  533 ++
 llama.cpp/tests/test-jinja.cpp                     | 2290 +++++
 llama.cpp/tests/test-json-partial.cpp              |  287 +
 llama.cpp/tests/test-json-schema-to-grammar.cpp    | 1490 ++++
 llama.cpp/tests/test-llama-grammar.cpp             |  408 +
 llama.cpp/tests/test-log.cpp                       |   39 +
 llama.cpp/tests/test-lora-conversion-inference.sh  |  144 +
 llama.cpp/tests/test-model-load-cancel.cpp         |   27 +
 llama.cpp/tests/test-mtmd-c-api.c                  |   63 +
 llama.cpp/tests/test-opt.cpp                       | 1003 +++
 llama.cpp/tests/test-peg-parser.cpp                |   25 +
 llama.cpp/tests/test-quantize-fns.cpp              |  186 +
 llama.cpp/tests/test-quantize-perf.cpp             |  356 +
 llama.cpp/tests/test-quantize-stats.cpp            |  424 +
 llama.cpp/tests/test-regex-partial.cpp             |  288 +
 llama.cpp/tests/test-rope.cpp                      |  263 +
 llama.cpp/tests/test-sampling.cpp                  |  400 +
 llama.cpp/tests/test-state-restore-fragmented.cpp  |  122 +
 llama.cpp/tests/test-thread-safety.cpp             |  164 +
 llama.cpp/tests/test-tokenizer-0.cpp               |  312 +
 llama.cpp/tests/test-tokenizer-0.py                |   46 +
 llama.cpp/tests/test-tokenizer-0.sh                |   41 +
 llama.cpp/tests/test-tokenizer-1-bpe.cpp           |  155 +
 llama.cpp/tests/test-tokenizer-1-spm.cpp           |  125 +
 llama.cpp/tests/test-tokenizer-random.py           |  566 ++
 llama.cpp/tests/test-tokenizers-repo.sh            |   43 +
 llama.cpp/tests/testing.h                          |  243 +
 56 files changed, 33184 insertions(+)
 create mode 100644 llama.cpp/tests/.gitignore
 create mode 100644 llama.cpp/tests/CMakeLists.txt
 create mode 100644 llama.cpp/tests/get-model.cpp
 create mode 100644 llama.cpp/tests/get-model.h
 create mode 100644 llama.cpp/tests/peg-parser/simple-tokenize.cpp
 create mode 100644 llama.cpp/tests/peg-parser/simple-tokenize.h
 create mode 100644 llama.cpp/tests/peg-parser/test-basic.cpp
 create mode 100644 llama.cpp/tests/peg-parser/test-gbnf-generation.cpp
 create mode 100644 llama.cpp/tests/peg-parser/test-json-parser.cpp
 create mode 100644 llama.cpp/tests/peg-parser/test-json-serialization.cpp
 create mode 100644 llama.cpp/tests/peg-parser/test-unicode.cpp
 create mode 100644 llama.cpp/tests/peg-parser/tests.h
 create mode 100644 llama.cpp/tests/run-json-schema-to-grammar.mjs
 create mode 100644 llama.cpp/tests/test-alloc.cpp
 create mode 100644 llama.cpp/tests/test-arg-parser.cpp
 create mode 100644 llama.cpp/tests/test-autorelease.cpp
 create mode 100644 llama.cpp/tests/test-backend-ops.cpp
 create mode 100644 llama.cpp/tests/test-backend-sampler.cpp
 create mode 100644 llama.cpp/tests/test-barrier.cpp
 create mode 100644 llama.cpp/tests/test-c.c
 create mode 100644 llama.cpp/tests/test-chat-parser.cpp
 create mode 100644 llama.cpp/tests/test-chat-peg-parser.cpp
 create mode 100644 llama.cpp/tests/test-chat-template.cpp
 create mode 100644 llama.cpp/tests/test-chat.cpp
 create mode 100644 llama.cpp/tests/test-double-float.cpp
 create mode 100644 llama.cpp/tests/test-gbnf-validator.cpp
 create mode 100644 llama.cpp/tests/test-gguf.cpp
 create mode 100644 llama.cpp/tests/test-grammar-integration.cpp
 create mode 100644 llama.cpp/tests/test-grammar-llguidance.cpp
 create mode 100644 llama.cpp/tests/test-grammar-parser.cpp
 create mode 100644 llama.cpp/tests/test-jinja.cpp
 create mode 100644 llama.cpp/tests/test-json-partial.cpp
 create mode 100755 llama.cpp/tests/test-json-schema-to-grammar.cpp
 create mode 100644 llama.cpp/tests/test-llama-grammar.cpp
 create mode 100644 llama.cpp/tests/test-log.cpp
 create mode 100755 llama.cpp/tests/test-lora-conversion-inference.sh
 create mode 100644 llama.cpp/tests/test-model-load-cancel.cpp
 create mode 100644 llama.cpp/tests/test-mtmd-c-api.c
 create mode 100644 llama.cpp/tests/test-opt.cpp
 create mode 100644 llama.cpp/tests/test-peg-parser.cpp
 create mode 100644 llama.cpp/tests/test-quantize-fns.cpp
 create mode 100644 llama.cpp/tests/test-quantize-perf.cpp
 create mode 100644 llama.cpp/tests/test-quantize-stats.cpp
 create mode 100644 llama.cpp/tests/test-regex-partial.cpp
 create mode 100644 llama.cpp/tests/test-rope.cpp
 create mode 100644 llama.cpp/tests/test-sampling.cpp
 create mode 100644 llama.cpp/tests/test-state-restore-fragmented.cpp
 create mode 100644 llama.cpp/tests/test-thread-safety.cpp
 create mode 100644 llama.cpp/tests/test-tokenizer-0.cpp
 create mode 100644 llama.cpp/tests/test-tokenizer-0.py
 create mode 100755 llama.cpp/tests/test-tokenizer-0.sh
 create mode 100644 llama.cpp/tests/test-tokenizer-1-bpe.cpp
 create mode 100644 llama.cpp/tests/test-tokenizer-1-spm.cpp
 create mode 100644 llama.cpp/tests/test-tokenizer-random.py
 create mode 100755 llama.cpp/tests/test-tokenizers-repo.sh
 create mode 100644 llama.cpp/tests/testing.h

(limited to 'llama.cpp/tests')

diff --git a/llama.cpp/tests/.gitignore b/llama.cpp/tests/.gitignore
new file mode 100644
index 0000000..ba2b164
--- /dev/null
+++ b/llama.cpp/tests/.gitignore
@@ -0,0 +1,6 @@
+*
+!*.*
+*.o
+ggml-common.h
+**/*.swp
+!peg-parser
diff --git a/llama.cpp/tests/CMakeLists.txt b/llama.cpp/tests/CMakeLists.txt
new file mode 100644
index 0000000..350bffc
--- /dev/null
+++ b/llama.cpp/tests/CMakeLists.txt
@@ -0,0 +1,266 @@
+llama_add_compile_flags()
+
+function(llama_build source)
+    set(TEST_SOURCES ${source} ${ARGN})
+
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
+    add_executable(${TEST_TARGET} ${TEST_SOURCES})
+    target_link_libraries(${TEST_TARGET} PRIVATE common)
+    if (LLAMA_TESTS_INSTALL)
+        install(TARGETS ${TEST_TARGET} RUNTIME)
+    endif()
+endfunction()
+
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
"${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT DEFINED LLAMA_TEST_LABEL) + set(LLAMA_TEST_LABEL "main") + endif() + if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY) + set(LLAMA_TEST_WORKING_DIRECTORY .) + endif() + if (DEFINED LLAMA_TEST_NAME) + set(TEST_NAME ${LLAMA_TEST_NAME}) + else() + set(TEST_NAME ${target}) + endif() + + set(TEST_TARGET ${target}) + + add_test( + NAME ${TEST_NAME} + WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} + COMMAND $ + ${LLAMA_TEST_ARGS}) + + set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL}) +endfunction() + +function(llama_test_cmd target) + include(CMakeParseArguments) + set(options) + set(oneValueArgs NAME LABEL WORKING_DIRECTORY) + set(multiValueArgs ARGS) + cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT DEFINED LLAMA_TEST_LABEL) + set(LLAMA_TEST_LABEL "main") + endif() + if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY) + set(LLAMA_TEST_WORKING_DIRECTORY .) + endif() + if (DEFINED LLAMA_TEST_NAME) + set(TEST_NAME ${LLAMA_TEST_NAME}) + else() + set(TEST_NAME ${target}) + endif() + + add_test( + NAME ${TEST_NAME} + WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} + COMMAND ${target} + ${LLAMA_TEST_ARGS}) + + set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL}) +endfunction() + +# Builds and runs a test source file. +# Optional args: +# - NAME: name of the executable & test target (defaults to the source file name without extension) +# - LABEL: label for the test (defaults to main) +# - ARGS: arguments to pass to the test executable +# - WORKING_DIRECTORY +function(llama_build_and_test source) + include(CMakeParseArguments) + set(options) + set(oneValueArgs NAME LABEL WORKING_DIRECTORY) + set(multiValueArgs ARGS) + cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(TEST_SOURCES ${source} ${LLAMA_TEST_UNPARSED_ARGUMENTS} get-model.cpp) + + if (NOT DEFINED LLAMA_TEST_LABEL) + set(LLAMA_TEST_LABEL "main") + endif() + if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY) + set(LLAMA_TEST_WORKING_DIRECTORY .) 
+ endif() + if (DEFINED LLAMA_TEST_NAME) + set(TEST_TARGET ${LLAMA_TEST_NAME}) + else() + get_filename_component(TEST_TARGET ${source} NAME_WE) + endif() + + add_executable(${TEST_TARGET} ${TEST_SOURCES}) + if (LLAMA_TESTS_INSTALL) + install(TARGETS ${TEST_TARGET} RUNTIME) + endif() + target_link_libraries(${TEST_TARGET} PRIVATE common) + + add_test( + NAME ${TEST_TARGET} + WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} + COMMAND $ + ${LLAMA_TEST_ARGS}) + + set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL}) +endfunction() + +# build test-tokenizer-0 target once and add many tests +llama_build(test-tokenizer-0.cpp) + +llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf) + +if (NOT WIN32) + llama_test_cmd( + ${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh + NAME test-tokenizers-ggml-vocabs + WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} + ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs + ) +endif() + +if (LLAMA_LLGUIDANCE) + llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf) +endif () + +if (NOT WIN32 OR NOT BUILD_SHARED_LIBS) + # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API (when building with shared libraries) + llama_build_and_test(test-sampling.cpp) + llama_build_and_test(test-grammar-parser.cpp) + llama_build_and_test(test-grammar-integration.cpp) + llama_build_and_test(test-llama-grammar.cpp) + llama_build_and_test(test-chat.cpp) + # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 + if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) + target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server) + endif() + + if (NOT GGML_BACKEND_DL) + llama_build(test-quantize-stats.cpp) + endif() + + llama_build(test-gbnf-validator.cpp) + + # build test-tokenizer-1-bpe target once and add 
many tests + llama_build(test-tokenizer-1-bpe.cpp) + + # TODO: disabled due to slowness + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf) + #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf) + + # build test-tokenizer-1-spm target once and add many tests + llama_build(test-tokenizer-1-spm.cpp) + + llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf) + #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf) + + # llama_build_and_test(test-double-float.cpp) # SLOW +endif() + +llama_build_and_test(test-chat-parser.cpp) +llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp) +llama_build_and_test(test-chat-template.cpp) +llama_build_and_test(test-jinja.cpp) +llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python) +llama_build_and_test(test-json-partial.cpp) +llama_build_and_test(test-log.cpp) +llama_build_and_test( + test-peg-parser.cpp + peg-parser/simple-tokenize.cpp + peg-parser/test-basic.cpp + peg-parser/test-gbnf-generation.cpp + peg-parser/test-json-parser.cpp + peg-parser/test-json-serialization.cpp + peg-parser/test-unicode.cpp + peg-parser/tests.h +) +llama_build_and_test(test-regex-partial.cpp) + +if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf") + set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739") +else() + set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf") + set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d") +endif() +set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}") + +add_test(NAME test-download-model COMMAND ${CMAKE_COMMAND} + -DDEST=${MODEL_DEST} + -DNAME=${MODEL_NAME} + -DHASH=${MODEL_HASH} + -P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake +) +set_tests_properties(test-download-model PROPERTIES FIXTURES_SETUP test-download-model) + +llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) +set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model) + +llama_build_and_test(test-arg-parser.cpp) + +if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) + # TODO: repair known memory leaks + llama_build_and_test(test-opt.cpp) +endif() +llama_build_and_test(test-gguf.cpp) +llama_build_and_test(test-backend-ops.cpp) + +llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-autorelease.cpp LABEL "model") +llama_build_and_test(test-backend-sampler.cpp 
LABEL "model") + +# Test for state restore with fragmented KV cache +# Requires a model, uses same args pattern as test-thread-safety +llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${MODEL_DEST}") +set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED test-download-model) + +if (NOT GGML_BACKEND_DL) + # these tests use the backends directly and cannot be built with dynamic loading + llama_build_and_test(test-barrier.cpp) + llama_build_and_test(test-quantize-fns.cpp) + llama_build_and_test(test-quantize-perf.cpp) + llama_build_and_test(test-rope.cpp) +endif() + +# libmtmd +set(LLAMA_TEST_NAME test-mtmd-c-api) +llama_build_and_test(test-mtmd-c-api.c) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) + +# dummy executable - not installed +get_filename_component(TEST_TARGET test-c.c NAME_WE) +add_executable(${TEST_TARGET} test-c.c) +target_link_libraries(${TEST_TARGET} PRIVATE llama) + +llama_build_and_test(test-alloc.cpp) +target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) diff --git a/llama.cpp/tests/get-model.cpp b/llama.cpp/tests/get-model.cpp new file mode 100644 index 0000000..4edb685 --- /dev/null +++ b/llama.cpp/tests/get-model.cpp @@ -0,0 +1,21 @@ +#include +#include +#include + +#include "get-model.h" + +char * get_model_or_exit(int argc, char *argv[]) { + char * model_path; + if (argc > 1) { + model_path = argv[1]; + + } else { + model_path = getenv("LLAMACPP_TEST_MODELFILE"); + if (!model_path || strlen(model_path) == 0) { + fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE= to silence this warning and run this test.\n\033[0m"); + exit(EXIT_SUCCESS); + } + } + + return model_path; +} diff --git a/llama.cpp/tests/get-model.h b/llama.cpp/tests/get-model.h new file mode 100644 index 0000000..81a3a0f --- /dev/null +++ b/llama.cpp/tests/get-model.h @@ -0,0 +1,2 @@ +#pragma once +char * get_model_or_exit(int, char*[]); diff --git a/llama.cpp/tests/peg-parser/simple-tokenize.cpp b/llama.cpp/tests/peg-parser/simple-tokenize.cpp new file mode 100644 index 0000000..9abfa04 --- /dev/null +++ b/llama.cpp/tests/peg-parser/simple-tokenize.cpp @@ -0,0 +1,37 @@ +#include "simple-tokenize.h" + +std::vector simple_tokenize(const std::string & input) { + std::vector result; + std::string current; + + for (size_t i = 0; i < input.size(); i++) { + switch (input[i]) { + case ' ': + case '\n': + case '\t': + case '{': + case '}': + case ',': + case '[': + case '"': + case ']': + case '.': + case '<': + case '>': + case '=': + case '/': + if (!current.empty()) { + result.push_back(current); + current.clear(); + } + default:; + } + current += input[i]; + } + + if (!current.empty()) { + result.push_back(current); + } + + return result; +} diff --git a/llama.cpp/tests/peg-parser/simple-tokenize.h b/llama.cpp/tests/peg-parser/simple-tokenize.h new file mode 100644 index 0000000..1772432 --- /dev/null +++ b/llama.cpp/tests/peg-parser/simple-tokenize.h @@ -0,0 +1,6 @@ +#pragma once + +#include +#include + +std::vector simple_tokenize(const std::string &); diff --git a/llama.cpp/tests/peg-parser/test-basic.cpp b/llama.cpp/tests/peg-parser/test-basic.cpp new file mode 100644 index 0000000..1bda6f2 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-basic.cpp @@ -0,0 +1,454 @@ +#include "tests.h" + +void test_basic(testing & t) { + t.test("chars", [](testing & t) { + // Test common escape sequences - newline + t.test("escape_sequence_newline", [](testing &t) { + auto 
common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\n"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_newline", true, result.success()); + }); + + // Test common escape sequences - tab + t.test("escape_sequence_tab", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\t"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_tab", true, result.success()); + }); + + // Test common escape sequences - backslash + t.test("escape_sequence_backslash", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\\"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_backslash", true, result.success()); + }); + + // Test common escape sequences - space (should ()) + t.test("escape_sequence_space_fail", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context(" "); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_space_fail", true, result.fail()); + }); + + // Test escaped dash - 'a' should succeed + t.test("escaped_dash_a", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("a"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_a", true, result.success()); + }); + + // Test escaped dash - '-' should succeed (literal dash) + t.test("escaped_dash_literal", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("-"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_literal", true, result.success()); + }); + + // Test escaped dash - 'z' should succeed + t.test("escaped_dash_z", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("z"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_z", true, result.success()); + }); + + // Test escaped dash - 'b' should NOT match (since \- is literal dash, not range) + t.test("escaped_dash_b_fail", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("b"); + result = common_chat_combinator_parser.parse(ctx); + 
t.assert_equal("escaped_dash_b_fail", true, result.fail()); + }); + }); + + + t.test("optional", [](testing & t) { + // Full match with optional part present + t.test("optional_present", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = common_peg_parse_context("hello world"); + auto result = parser.parse(ctx); + t.assert_equal("optional_present", true, result.success()); + t.assert_equal("optional_present_end", 11u, result.end); + }); + + // Full match with optional part absent + t.test("optional_absent", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = common_peg_parse_context("hello", false); + auto result = parser.parse(ctx); + t.assert_equal("optional_absent", true, result.success()); + t.assert_equal("optional_absent_end", 5u, result.end); + }); + + // Partial match - waiting for more input to determine if optional matches + t.test("partial_match_need_more", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = common_peg_parse_context("hello ", true); + auto result = parser.parse(ctx); + t.assert_equal("partial_match_need_more", true, result.need_more_input()); + }); + }); + + t.test("partial parsing", [](testing & t) { + // Literals - Basic Success + t.test("literal_success", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("hello"); + result = parser.parse(ctx); + t.assert_equal("literal_success", true, result.success()); + }); + + // Char Classes - Basic Lowercase Success + t.test("char_class_lowercase_success", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("a"); + result = parser.parse(ctx); + t.assert_equal("char_class_lowercase_success", true, result.success()); + }); + + // Char Classes - Uppercase Fail + t.test("char_class_uppercase_fail", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("A"); + result = parser.parse(ctx); + t.assert_equal("char_class_uppercase_fail", true, result.fail()); + }); + + // Char Classes with Dash - Lowercase Success + t.test("char_class_with_dash_lowercase", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("f"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_lowercase", true, result.success()); + }); + + // Char Classes with Dash - Literal Dash Success + t.test("char_class_with_dash_literal_dash", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("-"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_literal_dash", 
true, result.success()); + }); + + // Char Classes with Dash - Uppercase Fail + t.test("char_class_with_dash_uppercase_fail", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("A"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail()); + }); + + // Sequences - Partial Match 1 + t.test("sequence_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("") + p.literal(""); }); + + auto ctx = common_peg_parse_context("") + p.literal(""); }); + + auto ctx = common_peg_parse_context("") + p.literal(""); }); + + auto ctx = common_peg_parse_context("I am common_chat_combinator_parser", true); + auto result = parser.parse(ctx); + t.assert_equal("sequence_no_match", true, result.fail()); + }); + + // Choices - Partial Match 1 + t.test("choices_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); }); + + auto ctx = common_peg_parse_context("opt", true); + auto result = parser.parse(ctx); + t.assert_equal("choices_partial_match_1", true, result.need_more_input()); + }); + + // Choices - Partial Match 2 + t.test("choices_partial_match_2", [&](testing & t) { + auto parser = + build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); }); + + auto ctx = common_peg_parse_context("choice", true); + auto result = parser.parse(ctx); + t.assert_equal("choices_partial_match_2", true, result.need_more_input()); + }); + + // Choices - Full Match 1 + t.test("choices_full_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); }); + + auto ctx = common_peg_parse_context("first", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_full_match_1", true, result.success()); + }); + + // Choices - Full Match 2 + t.test("choices_full_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); }); + + auto ctx = common_peg_parse_context("beta", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_full_match_2", true, result.success()); + }); + + // Choices - No Match + t.test("choices_no_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); }); + + auto ctx = common_peg_parse_context("best", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_no_match", true, result.fail()); + }); + + // Zero or More - Partial Match 1 + t.test("zero_or_more_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); }); + + auto ctx = common_peg_parse_context("a", true); + auto result = parser.parse(ctx); + t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input()); + }); + + // Zero or More - Partial Match 2 + t.test("zero_or_more_partial_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); }); + + auto ctx = common_peg_parse_context("xyx", true); + auto result = parser.parse(ctx); + 
t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input()); + }); + + // Zero or More - Full Match + t.test("zero_or_more_full_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); }); + + auto ctx = common_peg_parse_context("test", false); + auto result = parser.parse(ctx); + t.assert_equal("zero_or_more_full_match", true, result.success()); + }); + + // One or More - Partial Match 1 + t.test("one_or_more_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); }); + + auto ctx = common_peg_parse_context("rep", true); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input()); + }); + + // One or More - Partial Match 2 + t.test("one_or_more_partial_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); }); + + auto ctx = common_peg_parse_context("aba", true); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input()); + }); + + // One or More - Full Match + t.test("one_or_more_full_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); }); + + auto ctx = common_peg_parse_context("single", false); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_full_match", true, result.success()); + }); + + // One or More - No Match + t.test("one_or_more_no_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); }); + + auto ctx = common_peg_parse_context("success", false); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_no_match", true, result.fail()); + }); + }); + + + t.test("recursive rules", [](testing &t) { + // Test simple number + t.test("simple_number", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("1", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test simple list + t.test("simple_list", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[1]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test nested list + t.test("nested_list", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[2]]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test deeply nested list + t.test("deeply_nested_list", [](testing &t) { + auto value_parser = 
build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[[3]]]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test need_more_input match + t.test("need_more_input_match", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[", true); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test no match + t.test("no_match", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[a]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_fail", true, result.fail()); + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp b/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp new file mode 100644 index 0000000..68857a5 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp @@ -0,0 +1,250 @@ +#include "tests.h" + +#include "json-schema-to-grammar.h" + +#include + +static std::string trim_leading_space(const std::string & s) { + static const std::regex leading_ws_re = std::regex(R"((^|\n)\s+)"); + return std::regex_replace(s, leading_ws_re, "$1"); +} + +static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) { + t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual)); +} + +void test_gbnf_generation(testing &t) { + t.test("literal grammar generation", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("char class grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.chars("[a-z]", 1, 1); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= [a-z] + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("sequence grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.literal(" ") + p.literal("world"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" " " "world" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("choice grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("cat") | p.literal("dog"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & 
builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "cat" | "dog" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("one_or_more grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.one_or_more(p.literal("a")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "a"+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("zero_or_more grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.zero_or_more(p.literal("a")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "a"* + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("optional grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" " world"? + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("until grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.until(""); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= ([^<] | "<" [^/] | "])* + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("complex expressions with parentheses", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.one_or_more(p.literal("a") | p.literal("b")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= ("a" | "b")+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("rule references", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + auto digit = p.rule("digit", p.chars("[0-9]", 1, 1)); + return p.one_or_more(digit); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + digit ::= [0-9] + root ::= digit+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("escaping in literals", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello\nworld\n!"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello\nworld\n!" 
+ space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("operator<< (whitespace insertion)", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") << p.literal("world"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" space "world" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("emit only reachable rules", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("orphan", p.literal("orphan")); + return p.literal("hello") + p.rule("child", p.literal(" world")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + child ::= " world" + root ::= "hello" child + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("emit only trigger rules (and references)", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2")); + p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true); + p.rule("rule-3", p.literal("c") + p.ref("rule-4")); + p.rule("rule-4", p.literal("d"), true); + return rule1; + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= rule-1 + rule-1 ::= "a" rule-2 + rule-2 ::= "b" rule-3 + rule-3 ::= "c" rule-4 + rule-4 ::= "d" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + + auto gbnf_lazy = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder, true); + }); + + assert_gbnf_equal(t, R"""( + root ::= rule-2 | rule-4 + rule-2 ::= "b" rule-3 + rule-3 ::= "c" rule-4 + rule-4 ::= "d" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf_lazy); + }); +} diff --git a/llama.cpp/tests/peg-parser/test-json-parser.cpp b/llama.cpp/tests/peg-parser/test-json-parser.cpp new file mode 100644 index 0000000..48351cd --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-json-parser.cpp @@ -0,0 +1,109 @@ +#include "tests.h" + +void test_json_parser(testing &t) { + // Test parsing a simple JSON object + t.test("simple JSON object parsing", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"name": "test", "value": 42, "flag": true})"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test parsing a JSON array with mixed types + t.test("JSON array with mixed types", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"([1, "hello", true, null, 3.14])"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test parsing nested JSON with objects and arrays + t.test("nested JSON with objects and arrays", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = + R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": 
{"version": "1.0", "tags": ["admin", "user"]}})"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test need_more_input() parsing - incomplete object + t.test("need_more_input() parsing - incomplete object", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"name": "test", "value": )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test need_more_input() parsing - incomplete array + t.test("need_more_input() parsing - incomplete array", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"([1, 2, 3, )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test need_more_input() parsing - incomplete nested structure + t.test("need_more_input() parsing - incomplete nested structure", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"data": {"nested": )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + t.test("object member", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.json_member("name", "\"" + p.chars("[a-z]") + "\""); + }); + + t.test("success", [&](testing &t) { + std::string input = R"("name": "bob")"; + common_peg_parse_context ctx(input, false); + + auto result = parser.parse(ctx); + t.assert_true("success", result.success()); + }); + + t.test("partial", [&](testing &t) { + std::string input = R"("name": "bo)"; + common_peg_parse_context ctx(input, true); + + auto result = parser.parse(ctx); + t.assert_true("need more input", result.need_more_input()); + }); + + t.test("failed", [&](testing &t) { + std::string input = R"([])"; + common_peg_parse_context ctx(input, false); + + auto result = parser.parse(ctx); + t.assert_true("fail", result.fail()); + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/test-json-serialization.cpp b/llama.cpp/tests/peg-parser/test-json-serialization.cpp new file mode 100644 index 0000000..a858010 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-json-serialization.cpp @@ -0,0 +1,28 @@ +#include "tests.h" + +void test_json_serialization(testing &t) { + auto original = build_peg_parser([](common_peg_parser_builder & p) { + return "" + p.json() + ""; + }); + + auto json_serialized = original.to_json().dump(); + + t.test("compare before/after", [&](testing &t) { + auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized)); + + // Test complex JSON + std::string input = R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})"; + common_peg_parse_context ctx1(input); + common_peg_parse_context ctx2(input); + + auto result1 = original.parse(ctx1); + auto result2 = deserialized.parse(ctx2); + + t.assert_equal("both_succeed", result1.success(), result2.success()); + t.assert_equal("same_end_pos", result1.end, result2.end); + }); + + t.bench("deserialize", [&]() { + auto deserialized = 
common_peg_arena::from_json(nlohmann::json::parse(json_serialized)); + }, 100); +} diff --git a/llama.cpp/tests/peg-parser/test-unicode.cpp b/llama.cpp/tests/peg-parser/test-unicode.cpp new file mode 100644 index 0000000..19d9b9e --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-unicode.cpp @@ -0,0 +1,449 @@ +#include "tests.h" + +#include "peg-parser.h" + +#include +#include +#include +#include + +static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) { + t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual)); +} + +static std::string hex_dump(const std::string& str) { + std::ostringstream oss; + for (unsigned char c : str) { + if (std::isprint(c)) { + oss << c; + } else { + oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast(c); + } + } + return oss.str(); +} + +void test_unicode(testing &t) { + struct test_case { + std::string input; + std::string expected_text; + common_peg_parse_result_type expected_result; + }; + + t.test("any", [](testing &t) { + std::vector test_cases { + // Valid UTF-8 sequences + {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Incomplete UTF-8 sequences (partial bytes at end) + {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Invalid/malformed UTF-8 sequences + {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.one_or_more(p.any()), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("char classes", [](testing &t) { + t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) { + std::vector test_cases { + // Within range - CJK Unified Ideographs + {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00 + {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60 + {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D + {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF + + // Outside range - should fail + {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII + {std::string("\xE4\xB7\xBF"), 
"", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range) + {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range) + + // Incomplete sequences in range + {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00 + {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) { + std::vector test_cases { + // Within range - Emoticons (all 4-byte UTF-8) + {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600 + {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601 + {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F + + // Outside range + {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range) + {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range) + {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range) + + // Incomplete sequences + {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji + {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("mixed unicode ranges", [](testing &t) { + std::vector test_cases { + // Match CJK + {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00 + {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60 + + // Match emoticons + {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600 + + // Match ASCII digits + {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Don't match outside any range + {"a", "", 
COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 + + // Incomplete + {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + }); + + t.test("until parser", [](testing &t) { + t.test("ASCII delimiter with Unicode content", [](testing &t) { + std::vector test_cases { + // CJK characters before delimiter + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Emoji before delimiter + {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mixed content + {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until(""); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("incomplete UTF-8 at end", [](testing &t) { + std::vector test_cases { + // Incomplete emoji at end, no delimiter + {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete CJK at end, no delimiter + {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Complete content, no delimiter (should consume all valid UTF-8) + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until(""); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + 
t.test("malformed UTF-8", [](testing &t) { + std::vector test_cases { + // Invalid UTF-8 bytes + {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Continuation byte without lead byte + {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Invalid continuation byte + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until(""); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + }); + } + }); + }); + + t.test("json_string parser", [](testing &t) { + t.test("valid UTF-8 characters", [](testing &t) { + std::vector test_cases { + // ASCII only + {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 2-byte UTF-8 (accented characters) + {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 3-byte UTF-8 (CJK) + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 4-byte UTF-8 (emoji) + {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mixed content + {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.json_string_content(), p.literal("\"")}); + }); + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("incomplete UTF-8", [](testing &t) { + std::vector test_cases { + // Incomplete 2-byte sequence + {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete 3-byte sequence + {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete 4-byte sequence + {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete at very start + {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.json_string_content(); + }); + + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, 
matched); + } + }); + } + }); + + t.test("malformed UTF-8", [](testing &t) { + std::vector test_cases { + // Invalid UTF-8 bytes + {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Continuation byte without lead byte + {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Invalid continuation byte + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Overlong encoding (security issue) + {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.json_string_content(); + }); + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + }); + } + }); + + t.test("escape sequences with UTF-8", [](testing &t) { + std::vector test_cases { + // Unicode escape sequence + {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mix of UTF-8 and escape sequences + {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Escaped quote in UTF-8 string + {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.json_string_content(), p.literal("\"")}); + }); + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/tests.h b/llama.cpp/tests/peg-parser/tests.h new file mode 100644 index 0000000..4d3f4e9 --- /dev/null +++ b/llama.cpp/tests/peg-parser/tests.h @@ -0,0 +1,24 @@ +#pragma once + +// Common includes for all test files +#include +#include +#include + +#include "../testing.h" +#include "peg-parser.h" +#include "chat-peg-parser.h" +#include "simple-tokenize.h" + +struct bench_tool_call { + std::string id; + std::string name; + nlohmann::ordered_json args; +}; + +// Test function declarations +void test_basic(testing &t); +void test_json_parser(testing &t); +void test_gbnf_generation(testing &t); +void test_unicode(testing &t); +void test_json_serialization(testing &t); diff --git a/llama.cpp/tests/run-json-schema-to-grammar.mjs b/llama.cpp/tests/run-json-schema-to-grammar.mjs new file mode 100644 index 0000000..450c3dd --- /dev/null +++ b/llama.cpp/tests/run-json-schema-to-grammar.mjs @@ -0,0 +1,10 @@ +import { readFileSync } from "fs" +import { SchemaConverter } from "../tools/server/public_legacy/json-schema-to-grammar.mjs" + +const [, , file] = process.argv +const url = `file://${file}` +let schema = JSON.parse(readFileSync(file, "utf8")); +const converter = new SchemaConverter({}) +schema = await converter.resolveRefs(schema, url) +converter.visit(schema, 
'') +console.log(converter.formatGrammar()) diff --git a/llama.cpp/tests/test-alloc.cpp b/llama.cpp/tests/test-alloc.cpp new file mode 100644 index 0000000..95e09c9 --- /dev/null +++ b/llama.cpp/tests/test-alloc.cpp @@ -0,0 +1,608 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// +// dummy backend with configurable max_buffer_size, tracks allocations + +uint8_t * const alloc_base = (uint8_t *) 16; + +struct dummy_backend_context { + size_t max_buffer_size = 64; + size_t alignment = 8; + + ggml_backend_buffer_i buffer_interface; + std::vector buffers; + + size_t allocated_total() const { + size_t n = 0; + for (ggml_backend_buffer_t buf : buffers) { + n += ggml_backend_buffer_get_size(buf); + } + return n; + } +}; + +// ggml_backend_buffer_type interface + +static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) { + return "dummy_buffer_type"; +} + +static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + dummy_backend_context * ctx = (dummy_backend_context *) buft->context; + ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back(); + buffer = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size); + return buffer; +} + +static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + dummy_backend_context * ctx = (dummy_backend_context *) buft->context; + return ctx->alignment; +} + +static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + dummy_backend_context * ctx = (dummy_backend_context *) buft->context; + return ctx->max_buffer_size; +} + +static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) { + return true; +} + +// ggml_backend_buffer interface + +static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) { + dummy_backend_context * ctx = (dummy_backend_context *) buffer->context; + + auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer); + GGML_ASSERT(i != ctx->buffers.end()); + ctx->buffers.erase(i); +} + +static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) { + return alloc_base; +} + +static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) { + return GGML_STATUS_SUCCESS; +} + +static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {} + +static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {} + +static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {} + +static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {} + +// dummy_backend (not really a full backend, just provides what gallocr needs) + +struct dummy_backend { + std::unique_ptr context; + ggml_backend_buffer_type buffer_type; +}; + +static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) { + dummy_backend b{}; + b.context = std::make_unique(); + b.context->alignment = alignment; + b.context->max_buffer_size = max_buffer_size; + + b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer; + b.context->buffer_interface.get_base = dummy_backend_buffer_get_base; + b.context->buffer_interface.init_tensor = dummy_backend_buffer_init_tensor; + b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor; + b.context->buffer_interface.set_tensor = 
dummy_backend_buffer_set_tensor; + b.context->buffer_interface.get_tensor = dummy_backend_buffer_get_tensor; + b.context->buffer_interface.clear = dummy_backend_buffer_clear; + + b.buffer_type.context = b.context.get(); + b.buffer_type.iface.get_name = dummy_backend_buffer_type_get_name; + b.buffer_type.iface.alloc_buffer = dummy_backend_buffer_type_alloc_buffer; + b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment; + b.buffer_type.iface.get_max_size = dummy_backend_buffer_type_get_max_size; + b.buffer_type.iface.is_host = dummy_backend_buffer_type_is_host; + return b; +} + +// +// test utilities + +struct test_context_with_graph { + ggml_context * ctx; + ggml_cgraph * graph; + ggml_context_ptr ctx_ptr; +}; + +static test_context_with_graph make_context() { + ggml_init_params params{}; + params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead(); + params.no_alloc = true; + + ggml_context * ctx = ggml_init(params); + ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx); + ggml_cgraph * graph = ggml_new_graph(ctx); + return { ctx, graph, std::move(ctx_ptr) }; +} + +static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) { + ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + ggml_set_input(t); + return t; +} + +static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) { + GGML_ASSERT(size_bytes % 4 == 0); + return make_input_1d(ctx, size_bytes / 4); +} + +static void assign_names(ggml_context * ctx, const char * prefix = "x") { + int i = 0; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + ggml_format_name(t, "%s%d", prefix, i++); + } +} + +static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) { + for (int i = 0; i < graph->n_leafs; ++i) { + if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) { + return i; + } + } + fprintf(stderr, "leaf not found: %s\n", tensor_name); + return -1; +} + +static int get_node_id(ggml_cgraph * graph, const char * tensor_name) { + for (int i = 0; i < graph->n_nodes; ++i) { + if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) { + return i; + } + } + fprintf(stderr, "node not found: %s", tensor_name); + return -1; +} + +static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) { + ggml_set_output(out); + ggml_build_forward_expand(graph, out); + + ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft)); + bool result = ggml_gallocr_alloc_graph(galloc.get(), graph); + GGML_ASSERT(result); + return galloc; +} + +// +// correctness checks for result allocations + +static void check_all_allocated(ggml_cgraph * graph) { + for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) { + ggml_tensor * t = ggml_graph_node(graph, i); + GGML_ASSERT(t->buffer != nullptr); + GGML_ASSERT(t->data != nullptr); + } +} + +static void check_max_size(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) { + auto buft = ggml_backend_buffer_get_type(t->buffer); + size_t max_size = ggml_backend_buft_get_max_size(buft); + size_t offset = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer); + GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer)); + GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size); + } +} + +static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) { + if (other->flags & 
GGML_TENSOR_FLAG_OUTPUT) { + return false; + } + // Check if `other` is still "alive", i.e. an input to any node after the `current` op + for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) { + ggml_tensor * t = ggml_graph_node(graph, i); + for (int s = 0; s < GGML_MAX_SRC; s++) { + if (t == current && ggml_op_can_inplace(t->op)) { + continue; + } + if (t->src[s] == other) { + return false; + } + if (t->src[s] && t->src[s]->view_src == other) { + return false; + } + } + } + return true; +} + +static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) { + if (a->buffer != b->buffer) { + return false; + } + int64_t a0 = (int64_t) a->data; + int64_t a1 = a0 + ggml_nbytes(a); + int64_t b0 = (int64_t) b->data; + int64_t b1 = b0 + ggml_nbytes(b); + return a1 > b0 && b1 > a0; +} + +static ggml_tensor * get_view_source(ggml_tensor * t) { + while (t->view_src) { + t = t->view_src; + } + return t; +} + +static void check_no_overlap(ggml_cgraph * graph) { + for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) { + for (int j = 0; j < i; ++j) { + ggml_tensor * t = ggml_graph_node(graph, i); + ggml_tensor * o = ggml_graph_node(graph, j); + GGML_ASSERT(t != o); + + if (get_view_source(t) == get_view_source(o)) { + continue; + } + if (memory_overlap(t, o)) { + GGML_ASSERT(can_reuse_memory(graph, i, t, o)); + } + } + } +} + +// +// test cases + +// Scenario where the first backend buffer is completely exhausted and there are further +// tensors which require a second buffer +static void test_max_size_too_many_tensors() { + dummy_backend backend = dummy_backend_init(16); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[7]; + x[0] = make_input_with_size(ctx, 8); + x[1] = make_input_with_size(ctx, 8); + x[2] = make_input_with_size(ctx, 8); + x[3] = ggml_mul(ctx, x[0], x[1]); + x[4] = ggml_add(ctx, x[1], x[2]); + x[5] = ggml_add(ctx, x[3], x[0]); + x[6] = ggml_add(ctx, x[4], x[5]); + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 16 + 16); +} + +// Scenario where there is some space left in the first buffer, but not enough to accommodate +// a larger tensor, so a second buffer is required +static void test_max_size_tensor_too_large() { + dummy_backend backend = dummy_backend_init(32); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[3]; + x[0] = make_input_with_size(ctx, 16); // chunk 0, [0 , 16) + x[1] = make_input_with_size(ctx, 8); // chunk 0, [16, 24) + x[2] = ggml_concat(ctx, x[0], x[1], 0); // chunk 1, [0 , 24) + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 32 + 24); +} + +// Scenario where a single tensor exceeds the max buffer size - in this case the allocator +// should try to create a bigger buffer anyway, and wait for the backend to throw an error. +// Backends may report an artificially lower max size in some cases for compatibility reasons. 
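+// For example, with dummy_backend_init(16) below, the single 24-byte input cannot
+// be split across chunks, so the allocator is expected to request one 24-byte
+// buffer anyway and leave any rejection to the backend.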
+static void test_tensor_larger_than_max_size() { + dummy_backend backend = dummy_backend_init(16); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[2]; + x[0] = make_input_with_size(ctx, 24); + x[1] = ggml_scale(ctx, x[0], 2.0f); + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + GGML_ASSERT(backend.context->allocated_total() == 24); +} + +// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would +// require more. Expectation is that the last buffer should grow to fit everything, +// leaving it to the backend to error out if it can't allocate that much. +static void test_not_enough_chunks() { + const int max_chunks = 16; + const int max_size = 8; + + dummy_backend backend = dummy_backend_init(max_size); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[max_chunks + 1]; + for (int i = 0; i < max_chunks + 1; ++i) { + x[i] = make_input_with_size(ctx, max_size); + } + ggml_tensor * acc = x[0]; + for (int i = 0; i < max_chunks; ++i) { + acc = ggml_add(ctx, acc, x[i + 1]); + } + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size); +} + +// Fill up leftover unallocated space of a chunk after allocating a large tensor that +// requires a new chunk. +static void test_fill_leftover_space() { + dummy_backend backend = dummy_backend_init(16); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[4]; + x[0] = make_input_with_size(ctx, 8); + x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0); + x[3] = ggml_mean(ctx, x[1]); + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 12 + 16); +} + +// Check that views don't require any extra memory +static void test_view_inplace() { + dummy_backend backend = dummy_backend_init(32); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[6]; + x[0] = make_input_1d(ctx, 4); // chunk 0, [0, 16) + x[1] = ggml_reshape_2d(ctx, x[0], 2, 2); // view of x0 + x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3); // view of x0 + x[3] = ggml_view_1d(ctx, x[2], 2, 4); // view of x0 + x[4] = make_input_1d(ctx, 2); // chunk 0, [16, 24) + x[5] = ggml_add(ctx, x[3], x[4]); // reuse (inplace add) + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 24); +} + +static void test_reuse_and_free() { + dummy_backend backend = dummy_backend_init(40); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[9]; + x[0] = make_input_with_size(ctx, 24); + x[1] = make_input_with_size(ctx, 8); + x[2] = make_input_with_size(ctx, 8); + x[3] = ggml_add(ctx, x[1], x[2]); // reuse, free x2 + x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0); // alloc new buffer, free x0 + x[5] = ggml_scale(ctx, x[4], 2.0f); // alloc from free block + x[6] = ggml_add(ctx, x[4], x[5]); // reuse, free x5 + x[7] = ggml_view_1d(ctx, x[6], 2, 8); // view + x[8] = ggml_add(ctx, x[3], x[7]); // reuse + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type); + 
check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32); +} + +static void test_merge_free_block(size_t max_buffer_size) { + dummy_backend backend = dummy_backend_init(max_buffer_size); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[9]; + x[0] = make_input_with_size(ctx, 16); + x[1] = make_input_with_size(ctx, 16); + x[2] = make_input_with_size(ctx, 16); + x[3] = ggml_mean(ctx, x[0]); + x[4] = ggml_mean(ctx, x[1]); + x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0); + x[6] = ggml_add(ctx, x[3], x[4]); + x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0); + x[8] = ggml_add(ctx, x[5], x[7]); + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24); +} + +// Check that previously allocated but freed memory is preferred over allocating +// additional memory, even if the remaining space in a chunk would match tensor size better +static void test_prefer_already_allocated_memory() { + dummy_backend backend = dummy_backend_init(32, /*align*/ 4); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[3]; + x[0] = make_input_with_size(ctx, 24); // [24b][8b unused] + x[1] = ggml_mean(ctx, x[0]); // [24b free][4b][4b unused] + x[2] = ggml_mean(ctx, x[1]); // should be allocated in the 24b block + assign_names(ctx); + + ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type); + check_all_allocated(graph); + check_no_overlap(graph); + GGML_ASSERT(backend.context->allocated_total() <= 28); +} + +// test for allocating on multiple devices with some tensors in the graph +// allocated externally (not by gallocr). 
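+// The leaf/node buffer ids passed to ggml_gallocr_new_n/ggml_gallocr_reserve_n
+// below select which buffer type each tensor is allocated from; tensors that
+// were already placed via ggml_backend_alloc_ctx_tensors_from_buft should be
+// left untouched by gallocr.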
+static void test_multiple_buffer_types() { + dummy_backend backend_a = dummy_backend_init(32); + dummy_backend backend_b = dummy_backend_init(SIZE_MAX); + + auto [ctx_a, _a, ctx_a_ptr] = make_context(); + auto [ctx_b, _b, ctx_b_ptr] = make_context(); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * a[2]; + a[0] = make_input_with_size(ctx_a, 16); + a[1] = make_input_with_size(ctx_a, 16); + assign_names(ctx_a, "a"); + + ggml_tensor * b[2]; + b[0] = make_input_with_size(ctx_b, 24); + b[1] = make_input_with_size(ctx_b, 4); + assign_names(ctx_b, "b"); + + ggml_tensor * x[9]; + x[0] = make_input_with_size(ctx, 16); + x[1] = ggml_mul(ctx, x[0], a[0]); + x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0); + x[3] = ggml_mul(ctx, x[2], b[0]); + x[4] = ggml_mean(ctx, x[3]); + x[5] = ggml_add(ctx, x[4], b[1]); + x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0); + x[7] = ggml_add(ctx, x[6], a[1]); + x[8] = ggml_scale(ctx, x[7], 2.0f); + assign_names(ctx, "x"); + + ggml_backend_buffer_ptr buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type)); + ggml_backend_buffer_ptr buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type)); + ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type }; + + // assign buffer types manually to avoid extra complexity from backend scheduler + ggml_set_output(x[8]); + ggml_build_forward_expand(graph, x[8]); + + GGML_ASSERT(graph->n_leafs == 5); + int leaf_buffer_ids[5]; + leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0; + leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0; + leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1; + leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1; + leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0; + + GGML_ASSERT(graph->n_nodes == 8); + int node_buffer_ids[8]; + node_buffer_ids[get_node_id(graph, "x1")] = 0; + node_buffer_ids[get_node_id(graph, "x2")] = 0; + node_buffer_ids[get_node_id(graph, "x3")] = 1; + node_buffer_ids[get_node_id(graph, "x4")] = 1; + node_buffer_ids[get_node_id(graph, "x5")] = 1; + node_buffer_ids[get_node_id(graph, "x6")] = 1; + node_buffer_ids[get_node_id(graph, "x7")] = 0; + node_buffer_ids[get_node_id(graph, "x8")] = 0; + + ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2)); + ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids); + ggml_gallocr_alloc_graph(galloc.get(), graph); + + check_all_allocated(graph); + check_no_overlap(graph); + check_max_size(ctx); + GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24); + GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24); +} + +static void test_buffer_size_zero() { + dummy_backend backend_a = dummy_backend_init(SIZE_MAX); + dummy_backend backend_b = dummy_backend_init(SIZE_MAX); + auto [ctx, graph, ctx_ptr] = make_context(); + + ggml_tensor * x[2]; + x[0] = make_input_with_size(ctx, 16); + x[1] = ggml_scale(ctx, x[0], 2.0f); + + ggml_set_output(x[1]); + ggml_build_forward_expand(graph, x[1]); + + int leaf_buffer_ids[1] = { 0 }; + int node_buffer_ids[1] = { 0 }; + + ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type }; + ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2)); + bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids); + bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph); + GGML_ASSERT(res1 && res2); + + check_all_allocated(graph); + GGML_ASSERT(backend_a.context->allocated_total() == 16); + GGML_ASSERT(backend_b.context->allocated_total() == 0); +} + +// Test 
re-using gallocr for a different graph. The new graph has the same +// total size, but one of the chunks is larger, so reallocation is required. +static void test_reallocation() { + dummy_backend backend = dummy_backend_init(32, /*align*/ 4); + ggml_gallocr_ptr galloc; + { + auto [ctx, graph, ctx_ptr] = make_context(); + ggml_tensor * x[4]; + x[0] = make_input_with_size(ctx, 24); + x[1] = make_input_with_size(ctx, 16); + x[2] = ggml_view_1d(ctx, x[0], 4, 0); + x[3] = ggml_add(ctx, x[2], x[1]); + assign_names(ctx); + + galloc = allocate_graph(graph, x[3], &backend.buffer_type); + check_all_allocated(graph); + GGML_ASSERT(backend.context->allocated_total() == 40); + } + { + auto [ctx, graph, ctx_ptr] = make_context(); + ggml_tensor * x[3]; + x[0] = make_input_with_size(ctx, 20); + x[1] = make_input_with_size(ctx, 20); + x[2] = ggml_add(ctx, x[0], x[1]); + assign_names(ctx); + ggml_set_output(x[2]); + ggml_build_forward_expand(graph, x[2]); + + bool result = ggml_gallocr_alloc_graph(galloc.get(), graph); + GGML_ASSERT(result); + check_all_allocated(graph); + GGML_ASSERT(backend.context->allocated_total() == 40); + } +} + +static void run(const char * name, void (*f)()) { + printf("%s ", name); + fflush(stdout); + f(); + printf("PASSED\n"); +} + +int main() { + run("test_max_size_too_many_tensors", test_max_size_too_many_tensors); + run("test_max_size_tensor_too_large", test_max_size_tensor_too_large); + run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size); + run("test_not_enough_chunks", test_not_enough_chunks); + run("test_fill_leftover_space", test_fill_leftover_space); + run("test_view_inplace", test_view_inplace); + run("test_reuse_and_free", test_reuse_and_free); + run("test_merge_free_block(32)", []() { test_merge_free_block(32); }); + run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); }); + run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory); + run("test_multiple_buffer_types", test_multiple_buffer_types); + run("test_buffer_size_zero", test_buffer_size_zero); + run("test_reallocation", test_reallocation); + return 0; +} diff --git a/llama.cpp/tests/test-arg-parser.cpp b/llama.cpp/tests/test-arg-parser.cpp new file mode 100644 index 0000000..67f8ca6 --- /dev/null +++ b/llama.cpp/tests/test-arg-parser.cpp @@ -0,0 +1,208 @@ +#include "arg.h" +#include "common.h" +#include "download.h" + +#include +#include +#include +#include + +#undef NDEBUG +#include + +int main(void) { + common_params params; + + printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); + for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { + try { + auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex); + common_params_add_preset_options(ctx_arg.options); + std::unordered_set seen_args; + std::unordered_set seen_env_vars; + for (const auto & opt : ctx_arg.options) { + // check for args duplications + for (const auto & arg : opt.get_args()) { + if (seen_args.find(arg) == seen_args.end()) { + seen_args.insert(arg); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str()); + exit(1); + } + } + // check for env var duplications + for (const auto & env : opt.get_env()) { + if (seen_env_vars.find(env) == seen_env_vars.end()) { + seen_env_vars.insert(env); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str()); + exit(1); + } + } + + // ensure shorter argument precedes longer argument + 
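+                    // (presumably so that usage listings print the short form,
+                    // e.g. "-m", before the long form, e.g. "--model")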
if (opt.args.size() > 1) { + const std::string first(opt.args.front()); + const std::string last(opt.args.back()); + + if (first.length() > last.length()) { + fprintf(stderr, "test-arg-parser: shorter argument should come before longer one: %s, %s\n", + first.c_str(), last.c_str()); + assert(false); + } + } + + // same check for negated arguments + if (opt.args_neg.size() > 1) { + const std::string first(opt.args_neg.front()); + const std::string last(opt.args_neg.back()); + + if (first.length() > last.length()) { + fprintf(stderr, "test-arg-parser: shorter negated argument should come before longer one: %s, %s\n", + first.c_str(), last.c_str()); + assert(false); + } + } + } + } catch (std::exception & e) { + printf("%s\n", e.what()); + assert(false); + } + } + + auto list_str_to_char = [](std::vector & argv) -> std::vector { + std::vector res; + for (auto & arg : argv) { + res.push_back(const_cast(arg.data())); + } + return res; + }; + + std::vector argv; + + printf("test-arg-parser: test invalid usage\n\n"); + + // missing value + argv = {"binary_name", "-m"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // wrong value (int) + argv = {"binary_name", "-ngl", "hello"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // wrong value (enum) + argv = {"binary_name", "-sm", "hello"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // non-existence arg in specific example (--draft cannot be used outside llama-speculative) + argv = {"binary_name", "--draft", "123"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); + + // negated arg + argv = {"binary_name", "--no-mmap"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + + printf("test-arg-parser: test valid usage\n\n"); + + argv = {"binary_name", "-m", "model_file.gguf"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model.path == "model_file.gguf"); + + argv = {"binary_name", "-t", "1234"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.cpuparams.n_threads == 1234); + + argv = {"binary_name", "--verbose"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.verbosity > 1); + + argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model.path == "abc.gguf"); + assert(params.n_predict == 6789); + assert(params.n_batch == 9090); + + // --draft cannot be used outside llama-speculative + argv = {"binary_name", "--draft", "123"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); + assert(params.speculative.n_max == 123); + + // multi-value args (CSV) + argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.lora_adapters.size() == 4); + assert(params.lora_adapters[0].path == 
"file1.gguf"); + assert(params.lora_adapters[1].path == "file2,2.gguf"); + assert(params.lora_adapters[2].path == "file3\"3\".gguf"); + assert(params.lora_adapters[3].path == "file4\".gguf"); + +// skip this part on windows, because setenv is not supported +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else + printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); + + setenv("LLAMA_ARG_THREADS", "blah", true); + argv = {"binary_name"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model.path == "blah.gguf"); + assert(params.cpuparams.n_threads == 1010); + + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.use_mmap == false); + assert(params.no_perf == true); + + printf("test-arg-parser: test environment variables being overwritten\n\n"); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name", "-m", "overwritten.gguf"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model.path == "overwritten.gguf"); + assert(params.cpuparams.n_threads == 1010); +#endif // _WIN32 + + printf("test-arg-parser: test download functions\n\n"); + const char * GOOD_URL = "http://ggml.ai/"; + const char * BAD_URL = "http://ggml.ai/404"; + + { + printf("test-arg-parser: test good URL\n\n"); + auto res = common_remote_get_content(GOOD_URL, {}); + assert(res.first == 200); + assert(res.second.size() > 0); + std::string str(res.second.data(), res.second.size()); + assert(str.find("llama.cpp") != std::string::npos); + } + + { + printf("test-arg-parser: test bad URL\n\n"); + auto res = common_remote_get_content(BAD_URL, {}); + assert(res.first == 404); + } + + { + printf("test-arg-parser: test max size error\n"); + common_remote_params params; + params.max_size = 1; + try { + common_remote_get_content(GOOD_URL, params); + assert(false && "it should throw an error"); + } catch (std::exception & e) { + printf(" expected error: %s\n\n", e.what()); + } + } + + printf("test-arg-parser: all tests OK\n\n"); +} diff --git a/llama.cpp/tests/test-autorelease.cpp b/llama.cpp/tests/test-autorelease.cpp new file mode 100644 index 0000000..ca87c56 --- /dev/null +++ b/llama.cpp/tests/test-autorelease.cpp @@ -0,0 +1,24 @@ +// ref: https://github.com/ggml-org/llama.cpp/issues/4952#issuecomment-1892864763 + +#include +#include +#include + +#include "llama.h" +#include "get-model.h" + +// This creates a new context inside a pthread and then tries to exit cleanly. 
+int main(int argc, char ** argv) { + auto * model_path = get_model_or_exit(argc, argv); + + std::thread([&model_path]() { + llama_backend_init(); + auto * model = llama_model_load_from_file(model_path, llama_model_default_params()); + auto * ctx = llama_init_from_model(model, llama_context_default_params()); + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + }).join(); + + return 0; +} diff --git a/llama.cpp/tests/test-backend-ops.cpp b/llama.cpp/tests/test-backend-ops.cpp new file mode 100644 index 0000000..ed99c24 --- /dev/null +++ b/llama.cpp/tests/test-backend-ops.cpp @@ -0,0 +1,8943 @@ +// This file defines tests for various GGML ops and backends. +// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent. +// For the backward pass it asserts that the gradients from backpropagation are consistent +// with the gradients obtained via the method of finite differences ("grad" mode, this is optional). +// It is also possible to check the performance ("perf" mode). +// +// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested, +// and section 3 defines which tests to run. +// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case, +// then go to section 3 and add an instantiation of your struct. + + +// ############################## +// ## Section 1: General Setup ## +// ############################## + + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __EMSCRIPTEN__ +# define N_THREADS 1 +#else +# define N_THREADS std::thread::hardware_concurrency() +#endif + +static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + size_t nels = ggml_nelements(tensor); + std::vector data(nels); + { + // parallel initialization + static const size_t n_threads = N_THREADS; + // static RNG initialization (revisit if n_threads stops being constant) + static std::vector generators = []() { + std::random_device rd; + std::vector vec; + vec.reserve(n_threads); + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + return vec; + }(); + + auto init_thread = [&](size_t ith, size_t start, size_t end) { + std::uniform_real_distribution distribution(min, max); + auto & gen = generators[ith]; + for (size_t i = start; i < end; i++) { + data[i] = distribution(gen); + } + }; + + if (n_threads == 1) { + init_thread(0, 0, nels); + } else { + std::vector> tasks; + tasks.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { + size_t start = i*nels/n_threads; + size_t end = (i+1)*nels/n_threads; + tasks.push_back(std::async(std::launch::async, init_thread, i, start, end)); + } + for (auto & t : tasks) { + t.get(); + } + } + } + + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { + ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float)); + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0); + + // dummy importance matrix + std::vector imatrix(tensor->ne[0], 1.0f); + const float * im = imatrix.data(); + if (!ggml_quantize_requires_imatrix(tensor->type)) { 
+ // when the imatrix is optional, we want to test both quantization with and without imatrix + // use one of the random numbers to decide + if (data[0] > 0.5f*(min + max)) { + im = nullptr; + } + } + + std::vector dataq(ggml_row_size(tensor->type, nels)); + { + // parallel quantization by block + size_t blck_size = ggml_blck_size(tensor->type); + size_t n_blocks = nels / blck_size; + + auto quantize_thread = [&](size_t start, size_t end) { + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), + start * blck_size, end - start, blck_size, im); + }; + + const size_t min_blocks_per_thread = 1; + const size_t n_quant_threads = std::min(std::max(N_THREADS/2, 1), + std::max(1, n_blocks / min_blocks_per_thread)); + + if (n_quant_threads == 1) { + // single-threaded quantization: do all blocks in the current thread + quantize_thread(0, n_blocks); + } else { + std::vector> tasks; + tasks.reserve(n_quant_threads); + for (size_t i = 0; i < n_quant_threads; i++) { + size_t start = i*n_blocks/n_quant_threads; + size_t end = (i+1)*n_blocks/n_quant_threads; + tasks.push_back(std::async(std::launch::async, quantize_thread, start, end)); + } + for (auto & t : tasks) { + t.get(); + } + } + } + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { + // This is going to create some weird integers though. + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else if (tensor->type == GGML_TYPE_I64) { + // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful. + const size_t nbytes_half = ggml_nbytes(tensor)/2; + ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half); + ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half); + } else { + GGML_ABORT("fatal error"); + } +} + +// generate an F16 mask where certain blocks are randomly masked with -INF value +static void init_tensor_kq_mask(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + GGML_ASSERT(tensor->type == GGML_TYPE_F16); + + GGML_TENSOR_LOCALS( int32_t, ne, tensor, ne); + + std::vector data_f32(ne0*ne1*ne2*ne3); + std::vector data_f16(ne0*ne1*ne2*ne3); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(min, max); + + for (size_t i = 0; i < data_f32.size(); i++) { + data_f32[i] = dis(gen); + } + + // block size + const int blck0 = 128; + const int blck1 = 64; + + // number of INF/zero blocks + const int n_inf_zero_blocks = 0.2*(ne0*ne1*ne2*ne3)/(blck0*blck1); + + for (int b = 0; b < n_inf_zero_blocks; b++) { + const int p3 = (rd() % ne3); + const int p2 = (rd() % ne2); + const int p1 = (rd() % ne1); + const int p0 = (rd() % ne0); + + bool inf = rd() & 1; + + for (int i1 = 0; i1 < blck1 && p1 + i1 < ne1; i1++) { + const int idx = p3*ne2*ne1*ne0 + p2*ne1*ne0 + (p1 + i1)*ne0 + p0; + + for (int i0 = 0; i0 < blck0 && p0 + i0 < ne0; i0++) { + data_f32[idx + i0] = inf ? 
-INFINITY : 0.0f; + } + } + } + + ggml_fp32_to_fp16_row(data_f32.data(), data_f16.data(), ne0*ne1*ne2*ne3); + + ggml_backend_tensor_set(tensor, data_f16.data(), 0, data_f16.size()*sizeof(ggml_fp16_t)); +} + +// generate a lower triangular matrix +static void init_tensor_tril(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + GGML_ASSERT(tensor->ne[0] == tensor->ne[1]); + + GGML_TENSOR_LOCALS(int32_t, ne, tensor, ne); + GGML_TENSOR_LOCALS(size_t, nb, tensor, nb); + + std::vector data_f32(ne0*ne1*ne2*ne3); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(min, max); + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = 0; i1 < ne1; i1++) { + for (int64_t i0 = 0; i0 < ne0; i0++) { + int64_t idx = (i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3) / sizeof(float); + if (i0 <= i1) { + data_f32[idx] = dis(gen); + } else { + data_f32[idx] = 0.0f; + } + } + } + } + } + + ggml_backend_tensor_set(tensor, data_f32.data(), 0, ggml_nbytes(tensor)); +} + +static std::vector tensor_to_float(const ggml_tensor * t) { + std::vector tv; + tv.reserve(ggml_nelements(t)); + + std::vector buf(ggml_nbytes(t)); + ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); + + const auto * tt = ggml_get_type_traits(t->type); + size_t bs = ggml_blck_size(t->type); + std::vector vq(ggml_blck_size(t->type)); + bool quantized = ggml_is_quantized(t->type); + + // access elements by index to avoid gaps in views + for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { + for (int64_t i2 = 0; i2 < t->ne[2]; i2++) { + for (int64_t i1 = 0; i1 < t->ne[1]; i1++) { + for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) { + size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0]; + if (t->type == GGML_TYPE_F16) { + tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i])); + } else if (t->type == GGML_TYPE_BF16) { + tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i])); + } else if (t->type == GGML_TYPE_F32) { + tv.push_back(*(float *) &buf[i]); + } else if (t->type == GGML_TYPE_I64) { + tv.push_back((float)*(int64_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I32) { + tv.push_back((float)*(int32_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I16) { + tv.push_back((float)*(int16_t *) &buf[i]); + } else if (t->type == GGML_TYPE_I8) { + tv.push_back((float)*(int8_t *) &buf[i]); + } else if (quantized) { + tt->to_float(&buf[i], vq.data(), bs); + tv.insert(tv.end(), vq.begin(), vq.end()); + } else { + GGML_ABORT("fatal error"); + } + } + } + } + } + + return tv; +} + +// normalized mean squared error = mse(a, b) / mse(a, 0) +static double nmse(const float * a, const float * b, size_t n) { + double mse_a_b = 0.0; + double mse_a_0 = 0.0; + + for (size_t i = 0; i < n; i++) { + float a_i = a[i]; + float b_i = b[i]; + + mse_a_b += (a_i - b_i) * (a_i - b_i); + mse_a_0 += a_i * a_i; + } + + return mse_a_b / mse_a_0; +} + +// difference between 2 sets (Jaccard distance, 0 - no difference, 1 - no overlap) +template +static double jdst(const T * a, const T * b, size_t n) { + std::unordered_map set_a; + std::unordered_map set_b; + + for (size_t i = 0; i < n; ++i) { + set_a[a[i]]++; + set_b[b[i]]++; + } + + size_t diff = 0; + + for (const auto & p : set_a) { + const int64_t na = p.second; + const int64_t nb = set_b.find(p.first) != set_b.end() ? 
set_b.at(p.first) : 0; + + diff += std::abs(na - nb); + } + + for (const auto & p : set_b) { + if (set_a.find(p.first) == set_a.end()) { + diff += p.second; + } + } + + return (double) diff / (2*n); +} + +// maximum absolute asymmetry between a and b +// asymmetry: (a - b) / (a + b) +// This is more stable than relative error if one of the values fluctuates towards zero. +// n: number of values to compare. +// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where +// a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail. +static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector & expected_vals) { + double sum = 0.0f; + + size_t nvalid = 0; + for (size_t i = 0; i < n; i++) { + if (!expected_vals.empty()) { + bool matches_any = false; + for (const float & ev : expected_vals) { + if (fabsf(a[i] - ev) < 1e-3f) { + matches_any = true; + break; + } + } + if (!matches_any) { + continue; + } + } + + const float asymm = (a[i] - b[i]) / (a[i] + b[i]); + + sum += fabsf(asymm); + nvalid++; + } + + return sum/nvalid; +} + +// utils for printing the variables of the test cases + +static std::string var_to_str(const std::string & x) { + return x; +} + +template +static std::string var_to_str(const T & x) { + return std::to_string(x); +} + +template +static std::string var_to_str(const T (&x)[N]) { + std::string s = "["; + for (size_t i = 0; i < N; i++) { + if (i > 0) { + s += ","; + } + s += var_to_str(x[i]); + } + s += "]"; + return s; +} + +template +static std::string var_to_str(const std::array & x) { + std::string s = "["; + for (size_t i = 0; i < N; i++) { + if (i > 0) { + s += ","; + } + s += var_to_str(x[i]); + } + s += "]"; + return s; +} + +static std::string var_to_str(ggml_type type) { + return ggml_type_name(type); +} + +static std::string var_to_str(ggml_prec prec) { + return prec == GGML_PREC_F32 ? 
"f32" : "def"; +} + +static std::string var_to_str(ggml_op_pool pool) { + switch (pool) { + case GGML_OP_POOL_AVG: return "avg"; + case GGML_OP_POOL_MAX: return "max"; + default: return std::to_string(pool); + } +} + +static std::string var_to_str(ggml_scale_mode mode) { + std::string str; + switch (mode & 0xFF) { + case GGML_SCALE_MODE_NEAREST: str = "nearest"; break; + case GGML_SCALE_MODE_BILINEAR: str = "bilinear"; break; + case GGML_SCALE_MODE_BICUBIC: str = "bicubic"; break; + default: str = std::to_string(mode); break; + } + if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) { + str += "|align_corners"; + } + if (mode & GGML_SCALE_FLAG_ANTIALIAS) { + str += "|antialias"; + } + return str; +} + +#define VAR_TO_STR(x) (#x "=" + var_to_str(x)) + +#define VARS_TO_STR1(a) VAR_TO_STR(a) +#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b) +#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c) +#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d) +#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e) +#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f) +#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g) +#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h) +#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i) +#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) +#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) +#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l) +#define VARS_TO_STR13(a, b, c, d, e, f, g, h, i, j, k, l, m) VAR_TO_STR(a) + "," + VARS_TO_STR12(b, c, d, e, f, g, h, i, j, k, l, m) +#define VARS_TO_STR14(a, b, c, d, e, f, g, h, i, j, k, l, m, n) VAR_TO_STR(a) + "," + VARS_TO_STR13(b, c, d, e, f, g, h, i, j, k, l, m, n) +#define VARS_TO_STR15(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) VAR_TO_STR(a) + "," + VARS_TO_STR14(b, c, d, e, f, g, h, i, j, k, l, m, n, o) +#define VARS_TO_STR16(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) VAR_TO_STR(a) + "," + VARS_TO_STR15(b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + +#ifdef GGML_USE_SYCL +static bool inline _isinf(float f) { + return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000; +} +#else +static bool inline _isinf(float f) { return std::isinf(f); } +#endif + +// accept FLT_MAX as infinity +static bool isinf_or_max(float f) { + return _isinf(f) || f == FLT_MAX || f == -FLT_MAX; +} + +static bool ggml_is_view_op(enum ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; +} + +static bool backend_has_feature(ggml_backend_t backend, const char * feature_name) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + + auto get_features = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (!get_features) { + return false; + } + + const ggml_backend_feature * features = get_features(reg); + if (!features) { + return false; + } + + for (const ggml_backend_feature * f = features; f->name; ++f) { + if (strcmp(f->name, feature_name) == 0 && strcmp(f->value, "1") == 0) { + return true; + } + } + 
return false; +} + +enum test_mode { + MODE_TEST, + MODE_PERF, + MODE_GRAD, + MODE_SUPPORT, +}; + +// Output format support similar to llama-bench +enum output_formats { CONSOLE, SQL, CSV }; + +static const char * output_format_str(output_formats format) { + switch (format) { + case CONSOLE: + return "console"; + case SQL: + return "sql"; + case CSV: + return "csv"; + default: + GGML_ABORT("invalid output format"); + } +} + +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "console") { + format = CONSOLE; + } else if (s == "sql") { + format = SQL; + } else if (s == "csv") { + format = CSV; + } else { + return false; + } + return true; +} + +// Test result structure for SQL output +struct test_result { + std::string test_time; + std::string build_commit; + std::string backend_name; + std::string op_name; + std::string op_params; + std::string test_mode; + bool supported; + bool passed; + std::string error_message; + double time_us; + double flops; + double bandwidth_gb_s; + size_t memory_kb; + int n_runs; + std::string device_description; + std::string backend_reg_name; + + test_result() { + // Initialize with default values + time_us = 0.0; + flops = 0.0; + bandwidth_gb_s = 0.0; + memory_kb = 0; + n_runs = 0; + supported = false; + passed = false; + + // Set test time + time_t t = time(NULL); + char buf[32]; + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + // Set build info + build_commit = ggml_commit(); + } + + test_result(const std::string & backend_name, const std::string & op_name, const std::string & op_params, + const std::string & test_mode, bool supported, bool passed, const std::string & error_message = "", + double time_us = 0.0, double flops = 0.0, double bandwidth_gb_s = 0.0, size_t memory_kb = 0, + int n_runs = 0, const std::string & device_description = "", const std::string & backend_reg_name = "") : + backend_name(backend_name), + op_name(op_name), + op_params(op_params), + test_mode(test_mode), + supported(supported), + passed(passed), + error_message(error_message), + time_us(time_us), + flops(flops), + bandwidth_gb_s(bandwidth_gb_s), + memory_kb(memory_kb), + n_runs(n_runs), + device_description(device_description), + backend_reg_name(backend_reg_name) { + // Set test time + time_t t = time(NULL); + char buf[32]; + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + // Set build info + build_commit = ggml_commit(); + } + + static const std::vector & get_fields() { + static const std::vector fields = { + "test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode", "supported", + "passed", "error_message", "time_us", "flops", "bandwidth_gb_s", "memory_kb", "n_runs", + "device_description", "backend_reg_name" + }; + return fields; + } + + enum field_type { STRING, BOOL, INT, FLOAT }; + + static field_type get_field_type(const std::string & field) { + if (field == "supported" || field == "passed") { + return BOOL; + } + if (field == "memory_kb" || field == "n_runs") { + return INT; + } + if (field == "time_us" || field == "flops" || field == "bandwidth_gb_s") { + return FLOAT; + } + return STRING; + } + + std::vector get_values() const { + return { test_time, + build_commit, + backend_name, + op_name, + op_params, + test_mode, + std::to_string(supported), + std::to_string(passed), + error_message, + std::to_string(time_us), + std::to_string(flops), + std::to_string(bandwidth_gb_s), + std::to_string(memory_kb), + std::to_string(n_runs), + 
device_description, + backend_reg_name }; + } +}; + +// Printer classes for different output formats +enum class test_status_t { NOT_SUPPORTED, OK, FAIL, SKIPPED }; + +struct test_operation_info { + std::string op_name; + std::string op_params; + std::string backend_name; + test_status_t status = test_status_t::OK; + std::string failure_reason; + + // Additional information fields that were previously in separate structs + std::string error_component; + std::string error_details; + + // Gradient info + int64_t gradient_index = -1; + std::string gradient_param_name; + float gradient_value = 0.0f; + + // MAA error info + double maa_error = 0.0; + double maa_threshold = 0.0; + + // Flags for different types of information + bool has_error = false; + bool has_gradient_info = false; + bool has_maa_error = false; + bool is_compare_failure = false; + bool is_large_tensor_skip = false; + + test_operation_info() = default; + + test_operation_info(const std::string & op_name, const std::string & op_params, const std::string & backend_name, + test_status_t status = test_status_t::OK, const std::string & failure_reason = "") : + op_name(op_name), + op_params(op_params), + backend_name(backend_name), + status(status), + failure_reason(failure_reason) {} + + // Set error information + void set_error(const std::string & component, const std::string & details) { + has_error = true; + error_component = component; + error_details = details; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set gradient information + void set_gradient_info(int64_t index, const std::string & param_name, float value) { + has_gradient_info = true; + gradient_index = index; + gradient_param_name = param_name; + gradient_value = value; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set MAA error information + void set_maa_error(double error, double threshold) { + has_maa_error = true; + maa_error = error; + maa_threshold = threshold; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set compare failure + void set_compare_failure() { + is_compare_failure = true; + if (status == test_status_t::OK) { + status = test_status_t::FAIL; + } + } + + // Set large tensor skip + void set_large_tensor_skip() { is_large_tensor_skip = true; } +}; + +struct test_summary_info { + size_t tests_passed; + size_t tests_total; + bool is_backend_summary = false; // true for backend summary, false for test summary + + test_summary_info() = default; + + test_summary_info(size_t tests_passed, size_t tests_total, bool is_backend_summary = false) : + tests_passed(tests_passed), + tests_total(tests_total), + is_backend_summary(is_backend_summary) {} +}; + +struct testing_start_info { + size_t device_count; + + testing_start_info() = default; + + testing_start_info(size_t device_count) : device_count(device_count) {} +}; + +struct backend_init_info { + size_t device_index; + size_t total_devices; + std::string device_name; + bool skipped = false; + std::string skip_reason; + std::string description; + size_t memory_total_mb = 0; + size_t memory_free_mb = 0; + bool has_memory_info = false; + + backend_init_info() = default; + + backend_init_info(size_t device_index, size_t total_devices, const std::string & device_name, bool skipped = false, + const std::string & skip_reason = "", const std::string & description = "", + size_t memory_total_mb = 0, size_t memory_free_mb = 0, bool has_memory_info = false) : + device_index(device_index), + 
total_devices(total_devices), + device_name(device_name), + skipped(skipped), + skip_reason(skip_reason), + description(description), + memory_total_mb(memory_total_mb), + memory_free_mb(memory_free_mb), + has_memory_info(has_memory_info) {} +}; + +struct backend_status_info { + std::string backend_name; + test_status_t status; + + backend_status_info() = default; + + backend_status_info(const std::string & backend_name, test_status_t status) : + backend_name(backend_name), + status(status) {} +}; + +struct overall_summary_info { + size_t backends_passed; + size_t backends_total; + bool all_passed; + + overall_summary_info() = default; + + overall_summary_info(size_t backends_passed, size_t backends_total, bool all_passed) : + backends_passed(backends_passed), + backends_total(backends_total), + all_passed(all_passed) {} +}; + +struct printer { + virtual ~printer() {} + + FILE * fout = stdout; + + virtual void print_header() {} + + virtual void print_test_result(const test_result & result) = 0; + + virtual void print_footer() {} + + virtual void print_operation(const test_operation_info & info) { (void) info; } + + virtual void print_summary(const test_summary_info & info) { (void) info; } + + virtual void print_testing_start(const testing_start_info & info) { (void) info; } + + virtual void print_backend_init(const backend_init_info & info) { (void) info; } + + virtual void print_backend_status(const backend_status_info & info) { (void) info; } + + virtual void print_overall_summary(const overall_summary_info & info) { (void) info; } + + virtual void print_failed_tests(const std::vector & failed_tests) { (void) failed_tests; } +}; + +struct console_printer : public printer { + void print_test_result(const test_result & result) override { + if (result.test_mode == "test") { + print_test_console(result); + } else if (result.test_mode == "perf") { + print_perf_console(result); + } else if (result.test_mode == "support") { + print_support_console(result); + } + } + + void print_operation(const test_operation_info & info) override { + printf(" %s(%s): ", info.op_name.c_str(), info.op_params.c_str()); + fflush(stdout); + + // Handle large tensor skip first + if (info.is_large_tensor_skip) { + printf("skipping large tensors for speed \n"); + return; + } + + // Handle not supported status + if (info.status == test_status_t::NOT_SUPPORTED) { + if (!info.failure_reason.empty()) { + printf("not supported [%s]\n", info.failure_reason.c_str()); + } else { + printf("not supported [%s]\n", info.backend_name.c_str()); + } + return; + } + + // Handle errors and additional information + if (info.has_error) { + if (info.error_component == "allocation") { + fprintf(stderr, "failed to allocate tensors [%s] ", info.backend_name.c_str()); + } else if (info.error_component == "backend") { + fprintf(stderr, " Failed to initialize %s backend\n", info.backend_name.c_str()); + } else { + fprintf(stderr, "Error in %s: %s\n", info.error_component.c_str(), info.error_details.c_str()); + } + } + + // Handle gradient info + if (info.has_gradient_info) { + printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", info.op_name.c_str(), info.gradient_index, + info.gradient_param_name.c_str(), info.gradient_value); + } + + // Handle MAA error + if (info.has_maa_error) { + printf("[%s] MAA = %.9f > %.9f ", info.op_name.c_str(), info.maa_error, info.maa_threshold); + } + + // Handle compare failure + if (info.is_compare_failure) { + printf("compare failed "); + } + + // Print final status + if (info.status == 
test_status_t::OK) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_summary(const test_summary_info & info) override { + if (info.is_backend_summary) { + printf("%zu/%zu backends passed\n", info.tests_passed, info.tests_total); + } else { + printf(" %zu/%zu tests passed\n", info.tests_passed, info.tests_total); + } + } + + void print_backend_status(const backend_status_info & info) override { + printf(" Backend %s: ", info.backend_name.c_str()); + if (info.status == test_status_t::OK) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_testing_start(const testing_start_info & info) override { + printf("Testing %zu devices\n\n", info.device_count); + } + + void print_backend_init(const backend_init_info & info) override { + printf("Backend %zu/%zu: %s\n", info.device_index + 1, info.total_devices, info.device_name.c_str()); + + if (info.skipped) { + printf(" %s\n", info.skip_reason.c_str()); + return; + } + + if (!info.description.empty()) { + printf(" Device description: %s\n", info.description.c_str()); + } + + if (info.has_memory_info) { + printf(" Device memory: %zu MB (%zu MB free)\n", info.memory_total_mb, info.memory_free_mb); + } + + printf("\n"); + } + + void print_overall_summary(const overall_summary_info & info) override { + printf("%zu/%zu backends passed\n", info.backends_passed, info.backends_total); + if (info.all_passed) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_failed_tests(const std::vector & failed_tests) override { + if (failed_tests.empty()) { + return; + } + + printf("\nFailing tests:\n"); + for (const auto & test_name : failed_tests) { + printf(" %s\n", test_name.c_str()); + } + } + + private: + void print_test_console(const test_result & result) { + printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str()); + fflush(stdout); + + if (!result.supported) { + printf("not supported [%s] ", result.backend_name.c_str()); + printf("\n"); + return; + } + + if (result.passed) { + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + + void print_perf_console(const test_result & result) { + int len = printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str()); + fflush(stdout); + + if (!result.supported) { + printf("not supported\n"); + return; + } + + // align while also leaving some margin for variations in parameters + int align = 8; + int last = (len + align - 1) / align * align; + if (last - len < 5) { + last += align; + } + printf("%*s", last - len, ""); + + printf(" %8d runs - %8.2f us/run - ", result.n_runs, result.time_us); + + if (result.flops > 0) { + auto format_flops = [](double flops) -> std::string { + char buf[256]; + if (flops >= 1e12) { + snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12); + } else if (flops >= 1e9) { + snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9); + } else if (flops >= 1e6) { + snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6); + } else { + snprintf(buf, sizeof(buf), "%6.2f kFLOP", flops / 1e3); + } + return buf; + }; + uint64_t op_flops_per_run = result.flops * result.time_us / 1e6; + printf("%s/run - \033[1;34m%sS\033[0m", format_flops(op_flops_per_run).c_str(), + format_flops(result.flops).c_str()); + } else { + printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m", result.memory_kb, result.bandwidth_gb_s); + } + printf("\n"); + } + + void print_support_console(const test_result & 
result) { + printf(" %s(%s): ", result.op_name.c_str(), result.op_params.c_str()); + fflush(stdout); + + if (result.supported) { + printf("\033[1;32mSUPPORTED\033[0m\n"); + } else { + printf("\033[1;31mNOT SUPPORTED\033[0m\n"); + } + } +}; + +struct sql_printer : public printer { + static std::string get_sql_field_type(const std::string & field) { + switch (test_result::get_field_type(field)) { + case test_result::STRING: + return "TEXT"; + case test_result::BOOL: + case test_result::INT: + return "INTEGER"; + case test_result::FLOAT: + return "REAL"; + default: + GGML_ABORT("invalid field type"); + } + } + + void print_header() override { + std::vector fields = test_result::get_fields(); + fprintf(fout, "CREATE TABLE IF NOT EXISTS test_backend_ops (\n"); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " %s %s%s\n", fields[i].c_str(), get_sql_field_type(fields[i]).c_str(), + i < fields.size() - 1 ? "," : ""); + } + fprintf(fout, ");\n\n"); + } + + void print_test_result(const test_result & result) override { + fprintf(fout, "INSERT INTO test_backend_ops ("); + std::vector fields = test_result::get_fields(); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, "%s%s", fields[i].c_str(), i < fields.size() - 1 ? ", " : ""); + } + fprintf(fout, ") VALUES ("); + std::vector values = result.get_values(); + for (size_t i = 0; i < values.size(); i++) { + fprintf(fout, "'%s'%s", values[i].c_str(), i < values.size() - 1 ? ", " : ""); + } + fprintf(fout, ");\n"); + } +}; + +struct csv_printer : public printer { + void print_header() override { + + std::vector fields = test_result::get_fields(); + std::vector fields_csv = get_fields_csv(); + for (size_t i = 0; i < fields.size(); i++) { + if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) { + continue; + } + printf("\"%s\"%s", fields[i].c_str(), i < fields.size() - 1 ? "," : ""); + } + printf("\n"); + } + + void print_test_result(const test_result & result) override { + + std::vector values = result.get_values(); + std::vector fields = test_result::get_fields(); + std::vector fields_csv = get_fields_csv(); + + for (size_t i = 0; i < values.size(); i++) { + + if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) { + continue; + } + + // Escape quotes and wrap in quotes for CSV + std::string escaped_value = values[i]; + size_t pos = 0; + while ((pos = escaped_value.find("\"", pos)) != std::string::npos) { + escaped_value.replace(pos, 1, "\"\""); + pos += 2; + } + printf("\"%s\"%s", escaped_value.c_str(), i < values.size() - 1 ? 
"," : ""); + } + printf("\n"); + } + + static std::vector get_fields_csv() { + return { + "op_name", + "op_params", + "supported", + "error_message", + "test_mode", + "backend_reg_name", + "backend_name", + }; + } + +}; + +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case CONSOLE: + return std::make_unique(); + case SQL: + return std::make_unique(); + case CSV: + return std::make_unique(); + } + GGML_ABORT("invalid output format"); +} + +struct test_case { + virtual ~test_case() {} + + virtual std::string op_desc(ggml_tensor * t) { + return ggml_op_desc(t); + } + + virtual std::string vars() { + return ""; + } + + virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; + + virtual double max_nmse_err() { + return 1e-7; + } + + virtual double max_nmse_err(ggml_backend_t backend) { + GGML_UNUSED(backend); + return max_nmse_err(); + } + + virtual double max_maa_err() { + return 1e-4; + } + + virtual double max_err() { + return max_nmse_err(); + } + + virtual double max_err(ggml_backend_t backend) { + return max_nmse_err(backend); + } + + virtual double err(const float * a, const float * b, size_t n) { + return nmse(a, b, n); + } + + virtual float grad_eps() { + return 1e-1f; + } + + // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher. + // If true, estimate gradient with 4 points, neglects 5th order derivative and higher. + virtual bool grad_precise() { + return false; + } + + // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests). + virtual int64_t grad_nmax() { + return 10000; + } + + // No effect if empty. + // If not empty, skip all gradient checks where the numerical result does not match any of the values. + // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable. 
+ virtual std::vector grad_expect() { + return {}; + } + + virtual void initialize_tensors(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } + } + + virtual size_t op_size(ggml_tensor * t) { + size_t size = ggml_nbytes(t); + // add source tensors + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (t->src[i] != NULL) { + size += ggml_nbytes(t->src[i]); + } + } + return size; + } + + virtual uint64_t op_flops(ggml_tensor * t) { + GGML_UNUSED(t); + return 0; + } + + virtual bool run_whole_graph() { return false; } + virtual std::vector fusion_test_nodes() { return {}; } + + ggml_cgraph * gf = nullptr; + ggml_cgraph * gb = nullptr; + + static const int sentinel_size = 1024; + + test_mode mode; + + std::vector sentinels; + + std::string current_op_name; + + void add_sentinel(ggml_context * ctx) { + if (mode == MODE_PERF || mode == MODE_GRAD || mode == MODE_SUPPORT) { + return; + } + ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size); + ggml_format_name(sentinel, "sent_%zu", sentinels.size()); + sentinels.push_back(sentinel); + } + + // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend + + ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) { + ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne); + add_sentinel(ctx); + return t; + } + + ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) { + ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0); + add_sentinel(ctx); + return t; + } + + ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) { + ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1); + add_sentinel(ctx); + return t; + } + + ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) { + ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2); + add_sentinel(ctx); + return t; + } + + ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3); + add_sentinel(ctx); + return t; + } + + // Checks an op against the test filter, which is a comma separated list of OP names or specific variations + bool matches_filter(ggml_tensor * op, const char * op_names_filter) { + if (op_names_filter) { + const auto op_name = op_desc(op); + const auto op_full_name = op_name + "(" + vars() + ")"; + std::string_view filter(op_names_filter); + while (!filter.empty()) { + auto comma_pos = filter.find_first_of(','); + const auto lparen_pos = filter.find_first_of('('); + if (lparen_pos < comma_pos) { + auto rparen_pos = filter.find_first_of(')'); + comma_pos = filter.find_first_of(',', rparen_pos); + const auto op_filter = filter.substr(0, comma_pos); + if (op_filter == op_full_name) { + return true; + } + } else { + const auto op_filter = filter.substr(0, comma_pos); + if (op_filter == op_name) { + return true; + } + } + filter = comma_pos != std::string_view::npos ? 
filter.substr(comma_pos + 1) : ""; + } + return false; + } else { + return true; + } + } + + test_status_t eval(ggml_backend_t backend1, + ggml_backend_t backend2, + const char * op_names_filter, + printer * output_printer) { + mode = MODE_TEST; + + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ggml_context * ctx = ggml_init(params); + GGML_ASSERT(ctx); + + gf = ggml_new_graph(ctx); + + // pre-graph sentinel + add_sentinel(ctx); + + ggml_tensor * out = build_graph(ctx); + current_op_name = op_desc(out); + + if (!matches_filter(out, op_names_filter)) { + //printf(" %s: skipping\n", op_desc(out).c_str()); + ggml_free(ctx); + return test_status_t::SKIPPED; + } + + // check if the backends support the ops + bool supported = true; + for (ggml_backend_t backend : {backend1, backend2}) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (!ggml_backend_supports_op(backend, t)) { + supported = false; + break; + } + } + } + + if (!supported) { + // Create test result for unsupported operation + test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", + false, false, "not supported"); + + if (output_printer) { + output_printer->print_test_result(result); + } + + ggml_free(ctx); + return test_status_t::NOT_SUPPORTED; + } + + // post-graph sentinel + add_sentinel(ctx); + + // allocate + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); + + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return test_status_t::FAIL; + } + + // build graph + ggml_build_forward_expand(gf, out); + + // add sentinels as graph nodes so that they are checked in the callback + for (ggml_tensor * sentinel : sentinels) { + ggml_graph_add_node(gf, sentinel); + } + + // randomize tensors + initialize_tensors(ctx); + + // compare + struct callback_userdata { + bool ok; + test_case * tc; + ggml_backend_t backend1; + ggml_backend_t backend2; + }; + + callback_userdata ud { + true, + this, + backend1, + backend2, + }; + + auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool { + callback_userdata * ud = (callback_userdata *) user_data; + const char * bn1 = ggml_backend_name(ud->backend1); + const char * bn2 = ggml_backend_name(ud->backend2); + + if (t1->op == GGML_OP_NONE) { + // sentinels must be unchanged + std::vector t1_data(ggml_nbytes(t1)); + std::vector t2_data(ggml_nbytes(t2)); + ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1)); + ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2)); + + if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) { + printf("sentinel mismatch: %s ", t1->name); + ud->ok = false; + return true; + } + } + + std::vector f1 = tensor_to_float(t1); + std::vector f2 = tensor_to_float(t2); + + for (size_t i = 0; i < f1.size(); i++) { + // check for nans + if (std::isnan(f1[i]) || std::isnan(f2[i])) { + printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]); + ud->ok = false; + return true; + } + // check for infs: both must be inf of the same sign, or both must be finite + if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) { + if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) { + if (std::signbit(f1[i]) != std::signbit(f2[i])) { + printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]); + 
ud->ok = false; + return true; + } + } else { + printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]); + ud->ok = false; + return true; + } + } + } + + double err = ud->tc->err(f1.data(), f2.data(), f1.size()); + if (err > ud->tc->max_err(ud->backend1)) { + printf("[%s] ERR = %.9f > %.9f ", ggml_op_desc(t1), err, ud->tc->max_err(ud->backend1)); + //for (int i = 0; i < (int) f1.size(); i++) { + // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]); + //} + //printf("\n"); + //exit(1); + ud->ok = false; + } + return true; + + GGML_UNUSED(index); + }; + + std::vector fused_nodes_to_verify = fusion_test_nodes(); + if (fused_nodes_to_verify.size() == 0 && run_whole_graph()) { + fused_nodes_to_verify.push_back(out); + } + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, + run_whole_graph() ? fused_nodes_to_verify.data() : nullptr, + fused_nodes_to_verify.size()); + + ggml_backend_buffer_free(buf); + + ggml_free(ctx); + + // Create test result + bool test_passed = ud.ok && cmp_ok; + std::string error_msg = test_passed ? "" : (!cmp_ok ? "compare failed" : "test failed"); + test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", supported, test_passed, + error_msg); + + if (output_printer) { + output_printer->print_test_result(result); + } + + return test_passed ? test_status_t::OK : test_status_t::FAIL; + } + + bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { + mode = MODE_PERF; + + static const size_t graph_nodes = 8192; + + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ggml_context_ptr ctx(ggml_init(params)); // smart ptr + GGML_ASSERT(ctx); + + ggml_tensor * out = build_graph(ctx.get()); + current_op_name = op_desc(out); + if (!matches_filter(out, op_names_filter)) { + //printf(" %s: skipping\n", op_desc(out).c_str()); + return true; + } + + if (!ggml_backend_supports_op(backend, out)) { + // Create test result for unsupported performance test + test_result result(ggml_backend_name(backend), current_op_name, vars(), "perf", false, false, + "not supported"); + + output_printer->print_test_result(result); + + return true; + } + + // allocate + ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr + + if (buf == NULL) { + printf("failed to allocate tensors\n"); + return false; + } + + // randomize tensors + initialize_tensors(ctx.get()); + + // build graph + ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), graph_nodes, false); + ggml_build_forward_expand(gf, out); + + // warmup run + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + + // determine number of runs + int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; + if (op_flops(out) > 0) { + // based on flops + const uint64_t GFLOP = 1000 * 1000 * 1000; + const uint64_t target_flops_cpu = 8ULL * GFLOP; + const uint64_t target_flops_gpu = 100ULL * GFLOP; + uint64_t target_flops = is_cpu ? 
target_flops_cpu : target_flops_gpu; + n_runs = (int)std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; + } else { + // based on memory size + const size_t GB = 1ULL << 30; + const size_t target_size_cpu = 8 * GB; + const size_t target_size_gpu = 32 * GB; + size_t target_size = is_cpu ? target_size_cpu : target_size_gpu; + n_runs = (int)std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; + } + + // duplicate the op + for (int i = 1; i < n_runs; i++) { + ggml_graph_add_node(gf, out); + } + + // calculate memory + size_t mem = n_runs * op_size(out); + auto tensor_op_size = [](ggml_tensor * t) { + size_t size = ggml_nbytes(t); + // add source tensors + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (t->src[i] != NULL) { + size += ggml_nbytes(t->src[i]); + } + } + return size; + }; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { + continue; + } + mem += tensor_op_size(ggml_graph_node(gf, i)); + } + + // run + int64_t total_time_us = 0; + int64_t total_mem = 0; + int total_runs = 0; + do { + int64_t start_time = ggml_time_us(); + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + int64_t end_time = ggml_time_us(); + + total_time_us += end_time - start_time; + total_mem += mem; + total_runs += n_runs; + } while (total_time_us < 1000*1000); // run for at least 1 second + + // Create test result + double avg_time_us = (double) total_time_us / total_runs; + double calculated_flops = (op_flops(out) > 0) ? (op_flops(out) * total_runs) / (total_time_us / 1e6) : 0.0; + double calculated_bandwidth = + (op_flops(out) == 0) ? total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0 : 0.0; + size_t calculated_memory_kb = op_size(out) / 1024; + + test_result result(ggml_backend_name(backend), current_op_name, vars(), "perf", true, true, "", avg_time_us, + calculated_flops, calculated_bandwidth, calculated_memory_kb, total_runs); + + if (output_printer) { + output_printer->print_test_result(result); + } + + return true; + } + + bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { + mode = MODE_SUPPORT; + + static const size_t graph_nodes = 8192; + + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ggml_context_ptr ctx(ggml_init(params)); // smart ptr + GGML_ASSERT(ctx); + + gf = ggml_new_graph_custom(ctx.get(), graph_nodes, false); + + ggml_tensor * out = build_graph(ctx.get()); + current_op_name = op_desc(out); + + if (!matches_filter(out, op_names_filter)) { + return true; + } + + bool supported = ggml_backend_supports_op(backend, out); + + std::string device_desc = ggml_backend_dev_description(ggml_backend_get_device(backend)); + std::string backend_reg_name = ggml_backend_reg_name(ggml_backend_dev_backend_reg(ggml_backend_get_device(backend))); + + test_result result(ggml_backend_name(backend), current_op_name, vars(), "support", supported, supported, + supported ? 
"yes" : "no", 0.0, 0.0, 0.0, 0, 0, device_desc, backend_reg_name); + + output_printer->print_test_result(result); + + return true; + } + + bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { + mode = MODE_GRAD; + const std::vector expect = grad_expect(); + + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ggml_context_ptr ctx(ggml_init(params)); // smart ptr + GGML_ASSERT(ctx); + + gf = ggml_new_graph_custom(ctx.get(), GGML_DEFAULT_GRAPH_SIZE, true); + gb = ggml_new_graph_custom(ctx.get(), GGML_DEFAULT_GRAPH_SIZE, true); + + ggml_tensor * out = build_graph(ctx.get()); + + if (!matches_filter(out, op_names_filter) || out->op == GGML_OP_OPT_STEP_ADAMW) { + return true; + } + + if (out->type != GGML_TYPE_F32) { + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + out->name + std::string("->type != FP32"))); + return true; + } + + // Print operation info first + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend))); + + // check if the backend supports the ops + bool supported = true; + bool any_params = false; + std::string failure_reason; + + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { + if (!ggml_backend_supports_op(backend, t)) { + supported = false; + failure_reason = ggml_backend_name(backend); + break; + } + if ((t->flags & GGML_TENSOR_FLAG_PARAM)) { + any_params = true; + if (t->type != GGML_TYPE_F32) { + supported = false; + failure_reason = std::string(t->name) + "->type != FP32"; + break; + } + } + } + if (!any_params) { + supported = false; + failure_reason = op_desc(out); + } + + if (!supported) { + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, failure_reason)); + return true; + } + + int64_t ngrads = 0; + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { + if (t->flags & GGML_TENSOR_FLAG_PARAM) { + ngrads += ggml_nelements(t); + } + } + if (ngrads > grad_nmax()) { + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_large_tensor_skip(); + output_printer->print_operation(info); + return true; + } + + + if (!ggml_is_scalar(out)) { + out = ggml_sum(ctx.get(), out); + ggml_set_name(out, "sum_of_out"); + } + ggml_set_loss(out); + + ggml_build_forward_expand(gf, out); + ggml_graph_cpy(gf, gb); + ggml_build_backward_expand(ctx.get(), gb, nullptr); + if (expect.size() != 1 || expect[0] != 0.0f) { + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { + GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE); + } + } + + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) { + if (!ggml_backend_supports_op(backend, t)) { + output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + ggml_backend_name(backend))); + supported = false; + break; + } + if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) { + 
output_printer->print_operation(test_operation_info(op_desc(out), vars(), ggml_backend_name(backend), + test_status_t::NOT_SUPPORTED, + std::string(t->name) + "->type != FP32")); + supported = false; + break; + } + } + if (!supported) { + return true; + } + + // allocate + ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr + if (buf == NULL) { + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_error("allocation", ""); + output_printer->print_operation(info); + return false; + } + + initialize_tensors(ctx.get()); // Randomizes all tensors (including gradients). + ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise. + + ggml_status status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + status = ggml_backend_graph_compute(backend, gb); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + + bool ok = true; + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { + if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) { + continue; + } + + const char * bn = ggml_backend_name(backend); + const int64_t ne = ggml_nelements(t); + + std::vector ga; + struct ggml_tensor * grad = ggml_graph_get_grad(gb, t); + if (grad) { + ga = tensor_to_float(grad); + } else { + ga.resize(ne); // default value is 0.0f + } + + for (int64_t i = 0; i < ne; ++i) { // gradient algebraic + // check for nans + if (!std::isfinite(ga[i])) { + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_gradient_info(i, bn, ga[i]); + output_printer->print_operation(info); + ok = false; + break; + } + } + if (!ok) { + break; + } + + std::vector gn(ne); // gradient numeric + GGML_ASSERT(ga.size() == gn.size()); + + std::vector x0 = tensor_to_float(t); // original t data + GGML_ASSERT(ggml_is_scalar(out)); + GGML_ASSERT(out->type == GGML_TYPE_F32); + + const float eps = grad_eps(); + for (int64_t i = 0; i < ne; ++i) { + const float xiu = x0[i] + 1.0f*eps; // x, index i, up + const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half + const float xidh = x0[i] - 0.5f*eps; // x, index i, down half + const float xid = x0[i] - 1.0f*eps; // x, index i, down + + float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh + + ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out)); + + ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out)); + + if (grad_precise()) { + ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. 
status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out)); + + ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float)); + status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + fprintf(stderr, "%s: ggml_backend_graph_compute failed. status=%s \n", __func__, ggml_status_to_string(status)); + return false; + } + ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out)); + + gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps); + } else { + gn[i] = (fu - fd) / (2.0f*eps); + } + + ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t)); + } + + const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect); + if (err > max_maa_err()) { + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_maa_error(err, max_maa_err()); + output_printer->print_operation(info); + ok = false; + break; + } + if (!ok) { + break; + } + } + + // Create final test result + test_operation_info final_info(op_desc(out), vars(), ggml_backend_name(backend)); + if (!ok) { + final_info.set_compare_failure(); + } + final_info.status = ok ? test_status_t::OK : test_status_t::FAIL; + output_printer->print_operation(final_info); + + if (ok) { + return true; + } + + return false; + } +}; + + +// ################################### +// ## Section 2: GGML Op Defintions ## +// ################################### + + +// The following is an example showing the bare minimum for creating a test for a GGML op. + +// GGML_OP_EXAMPLE +struct test_example : public test_case { + // Always define these 2 or variants thereof: + const ggml_type type; // The type of the input tensors. + const std::array ne; // The shape of the input tensors. + // For some ops it's necessary to define multiple types or shapes for the inputs. + // Or they may need additional parameters. + + // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros. + // In most cases these are just the properties of the struct that you defined above. + // This is needed for info prints. + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + // Define a constructor for the struct. + // In most cases it will be sufficient to have the same arguments as the struct has properties + // and just use initializer lists. + test_example(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + // Define how a simple GGML compute graph can be constructed for the new GGML op. + ggml_tensor * build_graph(ggml_context * ctx) override { + // Step 1: create input tensors that don't depend on any other tensors: + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging. + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + // Step 2: use the op that you want to test in the GGML compute graph. + ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition. + ggml_set_name(out, "out"); + + // Step 3: return the output tensor. + return out; + } + // In order to also check the gradients for your op, add calls like ggml_set_param(a) + // immediately after you create the tensors. + // This is optional and only makes sense if a backward pass has actually been implemented for the new op. 
+}; + + +// GGML_OP_UNARY +struct test_unary : public test_case { + const ggml_unary_op op; + const ggml_type type; + const std::array ne_a; + int v; // view (1 : non-contiguous a) + + std::string vars() override { + return VARS_TO_STR3(type, ne_a, v); + } + + test_unary(ggml_unary_op op, + ggml_type type = GGML_TYPE_F32, + std::array ne_a = {128, 2, 2, 2}, + int v = 0) + : op(op), type(type), ne_a(ne_a), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG || + op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU || + op == GGML_UNARY_OP_EXPM1 || op == GGML_UNARY_OP_SOFTPLUS; + + ggml_tensor * a; + if (v & 1) { + auto ne = ne_a; + ne[0] *= 3; + ne[1] *= 2; + ne[2] *= 5; + ne[3] *= 4; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (grad_supported) { + ggml_set_param(a); + } + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + if (grad_supported) { + ggml_set_param(a); + } + ggml_set_name(a, "a"); + } + + ggml_tensor * out = ggml_unary(ctx, a, op); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // test extended range of values to check for NaNs in GELU + init_tensor_uniform(t, -150.f, 150.f); + } + } + + float grad_eps() override { + return 15.0f; + } + + std::vector grad_expect() override { + if (op == GGML_UNARY_OP_ABS) { + return {-1.0f, 1.0f}; + } + if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) { + return {0.0f}; + } + if (op == GGML_UNARY_OP_RELU) { + return {0.0f, 1.0f}; + } + return {}; + } + +}; + +// GGML_OP_GLU +struct test_glu : public test_case { + const ggml_glu_op op; + const ggml_type type; + const std::array ne_a; + int v; // view (1 : non-contiguous a) + bool swapped; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, v, swapped); + } + + test_glu(ggml_glu_op op, + ggml_type type = GGML_TYPE_F32, + std::array ne_a = {128, 2, 2, 2}, + int v = 0, + bool swapped = false) + : op(op), type(type), ne_a(ne_a), v(v), swapped(swapped) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a; + if (v & 1) { + auto ne = ne_a; ne[0] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + } + + ggml_tensor * out = ggml_glu(ctx, a, op, swapped); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // test extended range of values to check for NaNs in GELU + init_tensor_uniform(t, -150.f, 150.f); + } + } +}; + +struct test_glu_split : public test_case { + const ggml_glu_op op; + const ggml_type type; + const std::array ne_a; + int v; // view (1 : non-contiguous a) + + std::string vars() override { + return VARS_TO_STR3(type, ne_a, v) + ",split"; + } + + test_glu_split(ggml_glu_op op, + ggml_type type = GGML_TYPE_F32, + std::array ne_a = {128, 2, 2, 2}, + int v = 0) 
+ : op(op), type(type), ne_a(ne_a), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a; + ggml_tensor * b; + if (v & 1) { + auto ne = ne_a; ne[0] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + + b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(b); + ggml_set_name(b, "b"); + + b = ggml_view_4d(ctx, b, ne_a[0], ne_a[1], ne_a[2], ne_a[3], b->nb[1], b->nb[2], b->nb[3], 0); + ggml_set_name(a, "view_of_b"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + b = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(b); + ggml_set_name(b, "b"); + } + + ggml_tensor * out = ggml_glu_split(ctx, a, b, op); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // test extended range of values to check for NaNs in GELU + init_tensor_uniform(t, -150.f, 150.f); + } + } +}; + +struct test_swiglu_oai : public test_case { + const ggml_type type; + const std::array ne_a; + int v; // view (1 : non-contiguous a) + float alpha; + float limit; + + std::string vars() override { + return VARS_TO_STR5(type, ne_a, v, alpha, limit); + } + + test_swiglu_oai(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {128, 2, 2, 2}, + int v = 0, + float alpha = 1.702f, + float limit = 7.0f) + : type(type), ne_a(ne_a), v(v), alpha(alpha), limit(limit) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a; + ggml_tensor * b; + if (v & 1) { + auto ne = ne_a; ne[0] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + + b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(b); + ggml_set_name(b, "b"); + + b = ggml_view_4d(ctx, b, ne_a[0], ne_a[1], ne_a[2], ne_a[3], b->nb[1], b->nb[2], b->nb[3], 0); + ggml_set_name(a, "view_of_b"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + b = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(b); + ggml_set_name(b, "b"); + } + + ggml_tensor * out = ggml_swiglu_oai(ctx, a, b, alpha, limit); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // test extended range of values to check for NaNs in GELU + init_tensor_uniform(t, -150.f, 150.f); + } + } +}; + +// GGML_OP_GET_ROWS +struct test_get_rows : public test_case { + const ggml_type type; + const int n; // cols + const int m; // rows + const int r; // rows to get + const int be1; // batch size + const int be2; // batch size + const bool v; // view (non-contiguous src1) + + std::string vars() override { + return VARS_TO_STR7(type, n, m, r, be1, be2, v); + } + + test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int be1 = 1, int be2 = 1, bool v = false) + : type(type), n(n), m(m), r(r), be1(be1), be2(be2), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * in = ggml_new_tensor_4d(ctx, type, 
n, m, be1, be2); + ggml_set_name(in, "in"); + + ggml_tensor * rows = ggml_new_tensor_3d(ctx, GGML_TYPE_I32, r, be1, be2); + ggml_set_name(rows, "rows"); + if (v) { + rows = ggml_view_3d(ctx, rows, r/2, be1, be2, rows->nb[1], rows->nb[2], 0); + ggml_set_name(rows, "view_of_rows"); + } + + const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows); + if (grad_supported) { + ggml_set_param(in); + // rows is a constant input -> no gradients + } + + ggml_tensor * out = ggml_get_rows(ctx, in, rows); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // rows + std::vector data(r*be1*be2); + for (int i = 0; i < r*be1*be2; i++) { + data[i] = rand() % m; + } + ggml_backend_tensor_set(t, data.data(), 0, r * be1 * be2 * sizeof(int)); + } else { + init_tensor_uniform(t); + } + } + } +}; + +// GGML_OP_GET_ROWS_BACK +struct test_get_rows_back : public test_case { + const ggml_type type; + const int n; // cols + const int m; // rows + const int r; // rows to get + const int b; // batch size + const bool v; // view (non-contiguous src1) + + std::string vars() override { + return VARS_TO_STR6(type, n, m, r, b, v); + } + + test_get_rows_back(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false) + : type(type), n(n), m(m), r(r), b(b), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * in_forward = ggml_new_tensor_3d(ctx, type, n, m, b); + ggml_set_name(in_forward, "in_forward"); + + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + ggml_set_name(rows, "rows"); + if (v) { + rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); + ggml_set_name(rows, "view_of_rows"); + } + + ggml_tensor * grad = ggml_new_tensor_3d(ctx, type, n, r, b); + ggml_set_name(grad, "grad"); + + ggml_tensor * out = ggml_get_rows_back(ctx, grad, rows, in_forward); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // rows + std::vector data(r*b); + for (int i = 0; i < r*b; i++) { + data[i] = rand() % m; + } + ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int)); + } else { + init_tensor_uniform(t); + } + } + } +}; + +static void init_set_rows_row_ids(ggml_tensor * t, int num_rows) { + std::random_device rd; + std::default_random_engine rng(rd()); + for (int i2 = 0; i2 < t->ne[2]; i2++) { + for (int i1 = 0; i1 < t->ne[1]; i1++) { + // generate a shuffled subset of row indices + std::vector data(num_rows); + for (int i = 0; i < num_rows; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + data.resize(t->ne[0]); + + const size_t offs = i1*t->nb[1] + i2*t->nb[2]; + if (t->type == GGML_TYPE_I32) { + // TODO: Make a template or something + std::vector data_i32(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data_i32[i] = static_cast(data[i]); + } + ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t)); + } else { + ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t)); + } + } + } +} + +// GGML_OP_SET_ROWS +struct test_set_rows : public test_case { + const ggml_type type; + const ggml_type type_idx; 
+    const std::array<int64_t, 4> ne;
+    const std::array<int, 2> nr23; // broadcast only dims 2 and 3
+    const int r; // rows to set
+    const bool v; // view (non-contiguous src1)
+
+    std::string vars() override {
+        return VARS_TO_STR6(type, type_idx, ne, nr23, r, v);
+    }
+
+    test_set_rows(ggml_type type,
+            ggml_type type_idx,
+            std::array<int64_t, 4> ne,
+            std::array<int, 2> nr23,
+            int r, bool v = false)
+        : type(type), type_idx(type_idx), ne(ne), nr23(nr23), r(r), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2]*nr23[0], ne[3]*nr23[1]);
+        ggml_set_name(dst, "dst");
+
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], r, ne[2]*nr23[0], ne[3]*nr23[1]);
+        ggml_set_name(src, "src");
+
+        ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, r, ne[2], ne[3]);
+        ggml_set_name(row_idxs, "row_idxs");
+
+        if (v) {
+            src = ggml_view_4d(ctx, src, ne[0], r/2, ne[2]*nr23[0], ne[3]*nr23[1], src->nb[1], src->nb[2], src->nb[3], 0);
+            row_idxs = ggml_view_3d(ctx, row_idxs, r/2, ne[2], ne[3], row_idxs->nb[1], row_idxs->nb[2], 0);
+            ggml_set_name(row_idxs, "view_of_rows");
+        }
+
+        ggml_tensor * out = ggml_set_rows(ctx, dst, src, row_idxs);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
+                if (ggml_is_view_op(t->op)) {
+                    continue;
+                }
+
+                init_set_rows_row_ids(t, ne[1]);
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+
+    double max_nmse_err() override {
+        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_IQ4_NL ||
+            type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1 || type == GGML_TYPE_Q8_0) {
+            // estimate what the max nmse error would be if one quantized value is
+            // off by one. The test values are distributed in [-1,1], so it'll be
+            // roughly (2.0 / 2^bits)^2, divided by the mean square value of the reference,
+            // which is roughly 0.25 times the number of elements.
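+            // For instance, under these assumptions a 4-bit type has an off-by-one step of
+            // 2.0/2^4 = 1/8, so the bound computed below works out to
+            // (1/8)^2 / (0.25*N) = 0.0625/N, where N = ne[0] * r * ne[2]*nr23[0] * ne[3]*nr23[1]
+            // is the number of elements being set.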
+ double err_estimate = 1.0f/8.0f; + if (type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) { + err_estimate /= 2.0f; + } + if (type == GGML_TYPE_Q8_0) { + err_estimate /= 8.0f; + } + err_estimate *= err_estimate; + err_estimate /= 0.25f*float(ne[0] * r * ne[2]*nr23[0] * ne[3]*nr23[1]); + return err_estimate; + } + return 1e-7; + } +}; + +// GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS +struct test_rope_set_rows : public test_case { + const ggml_type type; + const ggml_type type_idx; + const std::array ne_a; + int mode; + const int n_ctx{512}; + const int n_dims{128}; + + std::string vars() override { + return VARS_TO_STR4(type, type_idx, ne_a, mode); + } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "ROPE_SET_ROWS"; + } + + bool run_whole_graph() override { return true; } + + test_rope_set_rows(ggml_type type, + ggml_type type_idx, + std::array ne_a, + int mode) + : type(type), type_idx(type_idx), ne_a(ne_a), mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne_a[0], ne_a[1], ne_a[2], 1); + ggml_set_name(a, "a"); + + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + ggml_tensor * pos; + if (is_mrope || is_vision) { + pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4); + } else { + pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]); + } + ggml_set_name(pos, "pos"); + + float fs = 1.4245f; + float ef = 0.7465f; + float af = 1.4245f; + ggml_tensor * freq = nullptr; + + ggml_tensor * rope = nullptr; + if (is_mrope) { + if (is_vision) { + GGML_ASSERT(n_dims/4 > 0); + int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate + rope = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + GGML_ASSERT(n_dims/3 > 0); + int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0}; + rope = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } + } else { + rope = ggml_rope(ctx, a, pos, ne_a[0], mode); + } + + ggml_tensor * view = ggml_view_2d(ctx, rope, ne_a[0] * ne_a[1], ne_a[2], rope->nb[2], 0); + + ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne_a[0] * ne_a[1], ne_a[2] * ne_a[3], 1, 1); + ggml_set_name(dst, "dst"); + + ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne_a[2], 1, 1); + ggml_set_name(row_idxs, "row_idxs"); + + ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (strcmp(t->name, "row_idxs") == 0) { + if (ggml_is_view_op(t->op)) { + continue; + } + init_set_rows_row_ids(t, ne_a[2]); + } else if (t->type == GGML_TYPE_I32) { + // pos + const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? 
ne_a[2] * 4 : ne_a[2]; + std::vector data(num_pos_ids); + for (int i = 0; i < num_pos_ids; i++) { + data[i] = rand() % n_ctx; + } + ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int)); + } else { + if (t->ne[0] == n_dims/2) { + // frequency factors in the range [0.9f, 1.1f] + init_tensor_uniform(t, 0.9f, 1.1f); + } else { + init_tensor_uniform(t); + } + } + } + } +}; + +// GGML_OP_RMS_NORM + GGML_OP_MUL + GGML_OP_ROPE (+ GGML_OP_VIEW + GGML_OP_SET_ROWS) +struct test_rms_norm_mul_rope : public test_case { + const std::array ne; + const float eps; + const bool multi_add; // test a sequence of adds feeding into rms_norm + const bool set_rows; + int mode; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "RMS_NORM_MUL_ROPE"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR5(ne, eps, multi_add, set_rows, mode); + } + + test_rms_norm_mul_rope(std::array ne, float eps = 1e-6f, bool multi_add = false, + bool set_rows = false, int mode = GGML_ROPE_TYPE_NORMAL) + : ne(ne), eps(eps), multi_add(multi_add), set_rows(set_rows), mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1); + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1); + ggml_tensor * c = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1); + + if (multi_add) { + a = ggml_add(ctx, ggml_add(ctx, a, b), c); + } + + a = ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b); + + ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]); + + ggml_tensor * rope = ggml_rope(ctx, a, pos, ne[0], mode); + + ggml_tensor * out; + + if (set_rows) { + ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0); + + ggml_tensor * dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, ne[0] * ne[1], ne[2] * ne[3], 1, 1); + ggml_set_name(dst, "dst"); + + ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, GGML_TYPE_I64, ne[2], 1, 1); + ggml_set_name(row_idxs, "row_idxs"); + + out = ggml_set_rows(ctx, dst, view, row_idxs); + ggml_set_name(out, "out"); + } else { + out = rope; + } + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { + continue; + } + + init_set_rows_row_ids(t, ne[2]); + } else { + init_tensor_uniform(t); + } + } + } +}; + +// GGML_OP_ARGMAX +struct test_argmax : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_argmax(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 100, 1, 1}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_argmax(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + 
data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + init_tensor_uniform(t); + } + } + } + + double max_nmse_err() override { + return 0.0; + } +}; + +// GGML_OP_COUNT_EQUAL +struct test_count_equal : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_count_equal(ggml_type type = GGML_TYPE_F32, + std::array ne = {4, 500, 1, 1}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * a_argmax = ggml_argmax(ctx, a); + ggml_set_name(a_argmax, "a_argmax"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + ggml_tensor * b_argmax = ggml_argmax(ctx, b); + ggml_set_name(b_argmax, "b_argmax"); + + ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax); + ggml_set_name(out, "out"); + + return out; + } + + double max_nmse_err() override { + return 0.0; + } + + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + init_tensor_uniform(t); + } + } + } +}; + +// GGML_OP_REPEAT +struct test_repeat : public test_case { + const ggml_type type; + const std::array ne; + const std::array nr; + + std::string vars() override { + return VARS_TO_STR3(type, ne, nr); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) * 2; + } + + test_repeat(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + std::array nr = {2, 2, 2, 2}) + : type(type), ne(ne), nr(nr) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); + ggml_set_name(target, "target"); + + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(src); + ggml_set_name(src, "src"); + + ggml_tensor * out = ggml_repeat(ctx, src, target); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_REPEAT_BACK +struct test_repeat_back : public test_case { + const ggml_type type; + const std::array ne; + const std::array nr; + const bool v; // whether src is a noncontiguous view + + std::string vars() override { + return VARS_TO_STR4(type, ne, nr, v); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) * 2; + } + + test_repeat_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {8, 6, 4, 2}, + std::array nr = {2, 2, 2, 2}, + bool v = false) + : type(type), ne(ne), nr(nr), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); + ggml_set_name(src, "src"); + + if (v) { + GGML_ASSERT(ne[0] % 2 == 0); + GGML_ASSERT(ne[1] % 2 == 0); + GGML_ASSERT(ne[2] % 2 == 0); + GGML_ASSERT(ne[3] % 2 == 0); + GGML_ASSERT(nr[0] % 2 == 0 || nr[0] == 1); + 
GGML_ASSERT(nr[1] % 2 == 0 || nr[1] == 1); + GGML_ASSERT(nr[2] % 2 == 0 || nr[2] == 1); + GGML_ASSERT(nr[3] % 2 == 0 || nr[3] == 1); + + const int64_t ne00 = nr[0] == 1 ? src->ne[0] : src->ne[0] / 2; + const int64_t ne01 = nr[1] == 1 ? src->ne[1] : src->ne[1] / 2; + const int64_t ne02 = nr[2] == 1 ? src->ne[2] : src->ne[2] / 2; + const int64_t ne03 = nr[3] == 1 ? src->ne[3] : src->ne[3] / 2; + + src = ggml_view_4d(ctx, src, ne00, ne01, ne02, ne03, src->nb[1], src->nb[2], src->nb[3], 0); + } + + ggml_tensor * target = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(target, "target"); + + ggml_tensor * out = ggml_repeat_back(ctx, src, target); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_DUP +struct test_dup : public test_case { + const ggml_type type; + const std::array ne; + const std::array permute; + bool _use_permute; + + std::string vars() override { + std::string v = VARS_TO_STR2(type, ne); + if (_use_permute) v += "," + VAR_TO_STR(permute); + return v; + } + + test_dup(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 20, 1}, + std::array permute = {0, 0, 0, 0}) + : type(type), ne(ne), permute(permute), + _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(src); + ggml_set_name(src, "src"); + + if (_use_permute) { + src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + ggml_set_name(src, "src_permuted"); + } + + ggml_tensor * out = ggml_dup(ctx, src); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SET +struct test_set : public test_case { + const ggml_type type_src; + const ggml_type type_dst; + const std::array ne; + const int dim; + + std::string vars() override { + return VARS_TO_STR4(type_src, type_dst, ne, dim); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) + ggml_nbytes(t->src[0]); + } + + test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, + std::array ne = {6, 5, 4, 3}, int dim = 1) + : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_set_param(src); + ggml_set_name(src, "src"); + + auto ne_dst = ne; + for (int i = 0; i < dim; ++i) { + ne_dst[i] *= 2; + } + ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data()); + ggml_set_param(dst); + ggml_set_name(dst, "dst"); + + size_t offset = 0; + for (int i = 0; i < dim; ++i) { + offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i]; + } + ggml_tensor * out = ggml_set(ctx, dst, src, + // The backward pass requires setting a contiguous region: + src->nb[1], src->nb[2], src->nb[3], offset); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_CPY +struct test_cpy : public test_case { + const ggml_type type_src; + const ggml_type type_dst; + const std::array ne; + const std::array permute_src; + const std::array permute_dst; + bool _src_use_permute; + bool _dst_use_permute; + bool _src_transpose; + + std::string vars() override { + return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); + } + + double max_nmse_err() override { + if (type_src == type_dst) { + return 0.0; + } + if (type_dst == GGML_TYPE_Q4_0 || type_dst == GGML_TYPE_Q4_1 || type_dst == GGML_TYPE_IQ4_NL || + type_dst == GGML_TYPE_Q5_0 || type_dst == GGML_TYPE_Q5_1 || type_dst == 
GGML_TYPE_Q8_0) { + // estimate what the max nmse error would be if one quantized value is + // off by one. The test values are distributed in [-150,150], so it'll be + // roughly (150*2.0 / 2^bits)^2, divided by the mean square value of the reference, + // which is roughly 0.25*150^2 times the number of elements. + double err_estimate = 1.0f/8.0f * 150.0f; + if (type_dst == GGML_TYPE_IQ4_NL) { + // iq4_nl values are a bit more spread out + err_estimate *= 2.0f; + } + if (type_dst == GGML_TYPE_Q5_0 || type_dst == GGML_TYPE_Q5_1) { + err_estimate /= 2.0f; + } + if (type_dst == GGML_TYPE_Q8_0) { + err_estimate /= 8.0f; + } + err_estimate *= err_estimate; + err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]); + return err_estimate; + } + return 1e-6; + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) + ggml_nbytes(t->src[0]); + } + + test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, + std::array ne = {10, 10, 10, 1}, + std::array permute_src = {0, 0, 0, 0}, + std::array permute_dst = {0, 0, 0, 0}, + bool transpose_src = false) + : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), + _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), + _src_transpose(transpose_src){} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_set_param(src); + ggml_set_name(src, "src"); + + if (_src_use_permute) { + src = ggml_permute(ctx, src, permute_src[0], permute_src[1], permute_src[2], permute_src[3]); + ggml_set_name(src, "src_permuted"); + } + + if (_src_transpose) { + src = ggml_transpose(ctx, src); + ggml_set_name(src, "src_transposed"); + } + + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + ggml_set_name(dst, "dst"); + + if (_dst_use_permute) { + dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); + ggml_set_name(dst, "dst_permuted"); + } + + ggml_tensor * out = ggml_cpy(ctx, src, dst); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // test extended range of values to check if casting between f32 and i32 is consistent + init_tensor_uniform(t, -150.f, 150.f); + } + } +}; + +// GGML_OP_CONT +struct test_cont : public test_case { + const ggml_type type; + const std::array ne; + bool use_view_slice; + + std::string vars() override { + return VARS_TO_STR3(type, ne, use_view_slice); + } + + test_cont(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 10, 1}, + bool use_view_slice = false) + : type(type), ne(ne), use_view_slice(use_view_slice) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(src); + ggml_set_name(src, "src"); + + + ggml_tensor * dst; + if (use_view_slice) { + dst = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3], + src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1)); + ggml_set_name(dst, "src_view_slice"); + } else { + dst = ggml_transpose(ctx, src); + ggml_set_name(dst, "src_transposed"); + } + + ggml_tensor * out = ggml_cont(ctx, dst); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ADD +// 
GGML_OP_SUB +// GGML_OP_MUL +// GGML_OP_DIV +struct test_bin_bcast : public test_case { + using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *); + op_t op; + const ggml_type type; + const std::array ne; + const std::array nr; + int nf; // number of fused ops, nf == 1 -> single op (no fusion) + bool perm1; // permute src1? + + bool run_whole_graph() override { return nf > 1; } + + std::string vars() override { + return VARS_TO_STR5(type, ne, nr, nf, perm1); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) * 3; + } + + test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 1, 1}, + std::array nr = {1, 2, 1, 1}, + int nf = 1, + bool perm1 = false) + : op(op), type(type), ne(ne), nr(nr), nf(nf), perm1(perm1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + GGML_ASSERT(nf <= 16); + + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); + ggml_set_name(a, "a"); + + ggml_tensor * b[16]; + for (int i = 0; i < nf; ++i) { + if (perm1) { + const int p[4] = { 1, 2, 0, 3 }; // hardcoded for now + + b[i] = ggml_new_tensor_4d(ctx, type, ne[p[0]], ne[p[1]], ne[p[2]], ne[p[3]]); + b[i] = ggml_permute(ctx, b[i], p[0], p[1], p[2], p[3]); + } else { + b[i] = ggml_new_tensor(ctx, type, 4, ne.data()); + } + ggml_set_name(b[i], (std::string("b") + std::to_string(i)).c_str()); + } + + // The backward pass supports broadcasting only for GGML_ADD: + const bool grad_supported = op == ggml_add && ggml_are_same_shape(a, b[0]) && nf == 1 && !perm1; + if (grad_supported) { + ggml_set_param(a); + ggml_set_param(b[0]); + } + + ggml_tensor * out = a; + + for (int i = 0; i < nf; ++i) { + out = op(ctx, out, b[i]); + } + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (op == ggml_mul || op == ggml_div) { + // MUL and DIV have numerical issues around zero: + init_tensor_uniform(t, 0.9f, 1.1f); + } else { + init_tensor_uniform(t); + } + } + } + + float grad_eps() override { + return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1); + } + + bool grad_precise() override { + return op == ggml_div; + } + + double max_maa_err() override { + return op == ggml_add ? 
1e-4 : 1e-3; + } +}; + +// GGML_OP_ADD_ID +struct test_add_id : public test_case { + const ggml_type type_a; + const ggml_type type_b; + const int64_t n_embd; + const int64_t n_experts; + const int64_t n_experts_used; + const int64_t n_token; + + std::string vars() override { + return VARS_TO_STR6(type_a, type_b, n_embd, n_experts, n_experts_used, n_token); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) + ggml_nbytes(t->src[0]) + ggml_nbytes(t->src[2]); + } + + test_add_id(ggml_type type_a = GGML_TYPE_F32, + ggml_type type_b = GGML_TYPE_F32, + int64_t n_embd = 128, + int64_t n_experts = 16, + int64_t n_experts_used = 8, + int64_t n_token = 10) + : type_a(type_a), type_b(type_b), n_embd(n_embd), + n_experts(n_experts), n_experts_used(n_experts_used), n_token(n_token) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_3d(ctx, type_a, n_embd, n_experts_used, n_token); + ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, n_embd, n_experts); + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_experts, n_token); + if (n_experts_used != n_experts) { + ids = ggml_view_2d(ctx, ids, n_experts_used, n_token, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); + } + + ggml_tensor * out = ggml_add_id(ctx, a, b, ids); + ggml_set_name(out, "out"); + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + std::random_device rd; + std::default_random_engine rng(rd()); + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i % n_experts; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } + } +}; + +// GGML_OP_ADD1 +struct test_add1 : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_add1(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1); + // ggml_set_param(b); // TODO: implement + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_add1(ctx, a, b); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * ne[0]*ne[1]*ne[2]*ne[3]; + } +}; + +// GGML_OP_SCALE +struct test_scale : public test_case { + const ggml_type type; + const std::array ne; + float scale; + float bias; + bool inplace; + + std::string vars() override { + return VARS_TO_STR5(type, ne, scale, bias, inplace); + } + + test_scale(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 10, 10}, + float scale = 2.0f, + float bias = 0.0f, + bool inplace = false) + : type(type), ne(ne), scale(scale), bias(bias), inplace(inplace) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out; + if (inplace) { + out = ggml_scale_bias_inplace(ctx, a, scale, bias); + } else { + out = ggml_scale_bias(ctx, a, scale, bias); + } + 
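+        // both branches compute out[i] = a[i]*scale + bias; the inplace variant writes into a's buffer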
ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE +struct test_softcap : public test_case { + const ggml_type type; + const std::array ne; + float softcap; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "SOFTCAP"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR3(type, ne, softcap); + } + + test_softcap(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 10, 10}, + float softcap = 30.0f) + : type(type), ne(ne), softcap(softcap) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_scale(ctx, ggml_tanh(ctx, ggml_scale(ctx, a, 1.0f / softcap)), softcap); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SILU_BACK +struct test_silu_back : public test_case { + const ggml_type type; + const std::array ne; + float eps; + + std::string vars() override { + return VARS_TO_STR3(type, ne, eps); + } + + test_silu_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(grad, "grad"); + + ggml_tensor * out = ggml_silu_back(ctx, a, grad); + ggml_set_name(out, "out"); + + return out; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_NORM +struct test_norm : public test_case { + const ggml_type type; + const std::array ne; + const bool v; // whether a is a non-contiguous view + const float eps; + + std::string vars() override { + return VARS_TO_STR4(type, ne, v, eps); + } + + test_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + bool v = false, + float eps = 1e-6f) + : type(type), ne(ne), v(v), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + if (v) { + a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view of a"); + } + + ggml_tensor * out = ggml_norm(ctx, a, eps); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_NORM + GGML_OP_MUL + GGML_OP_ADD +struct test_norm_mul_add : public test_case { + const ggml_type type; + const std::array ne; + float eps; + const bool broadcast; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "NORM_MUL_ADD"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR4(type, ne, eps, broadcast); + } + + test_norm_mul_add(ggml_type type = GGML_TYPE_F32, + std::array ne = {128, 2, 1, 1}, + float eps = 1e-5f, + bool broadcast = false) + : type(type), ne(ne), eps(eps), broadcast(broadcast) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + std::array broadcast_dims = {ne[0], ne[1] * 2, ne[2] * 2, ne[3] * 2}; + + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? 
broadcast_dims.data() : ne.data()); + ggml_tensor * w = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); ggml_set_param(w); ggml_set_param(b); + ggml_set_name(a, "a"); ggml_set_name(w, "w"); ggml_set_name(b, "b"); + + // Use a, w and b early to avoid OP_NONE in graph + a = ggml_add(ctx, ggml_add(ctx, a, w), b); + + ggml_tensor * n = ggml_norm(ctx, a, eps); + ggml_tensor * m = ggml_mul(ctx, n, w); + ggml_tensor * out = ggml_add(ctx, m, b); + ggml_set_name(out, "out"); + return out; + } +}; +// GGML_OP_RMS_NORM +struct test_rms_norm : public test_case { + const ggml_type type; + const std::array ne; + const bool v; // whether a is a non-contiguous view + const float eps; + const bool inplace; // whether to do the operation inplace + + std::string vars() override { + return VARS_TO_STR5(type, ne, v, eps, inplace); + } + + test_rms_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + bool v = false, + float eps = 1e-6f, + bool inplace = false) + : type(type), ne(ne), v(v), eps(eps), inplace(inplace) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + if (v) { + a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view of a"); + } + + ggml_tensor * out; + if (inplace) { + out = ggml_rms_norm_inplace(ctx, a, eps); + } else { + out = ggml_rms_norm(ctx, a, eps); + } + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_RMS_NORM_BACK +struct test_rms_norm_back : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + + std::string vars() override { + return VARS_TO_STR3(type, ne, eps); + } + + test_rms_norm_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_rms_norm_back(ctx, a, b, eps); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } +}; + +// GGML_OP_RMS_NORM + GGML_OP_MUL + GGML_OP_ADD +struct test_rms_norm_mul_add : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + const bool broadcast; + const bool multi_add; // test a sequence of adds feeding into rms_norm + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "RMS_NORM_MUL_ADD"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR5(type, ne, eps, broadcast, multi_add); + } + + test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f, bool broadcast = false, bool multi_add = false) + : type(type), 
ne(ne), eps(eps), broadcast(broadcast), multi_add(multi_add) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + std::array broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4}; + + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? broadcast_dims.data() : ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_tensor * c = ggml_new_tensor(ctx, type, 4, ne.data()); + + ggml_set_param(a); + ggml_set_name(a, "a"); + ggml_set_param(b); + ggml_set_name(b, "b"); + ggml_set_param(c); + ggml_set_name(c, "c"); + + // Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul + a = ggml_add(ctx, ggml_add(ctx, a, b), c); + if (multi_add) { + a = ggml_add(ctx, ggml_add(ctx, a, b), c); + } + ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_ADD + GGML_OP_RMS_NORM (fused operation) +struct test_add_rms_norm : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + const bool broadcast; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "ADD_RMS_NORM"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR4(type, ne, eps, broadcast); + } + + test_add_rms_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f, bool broadcast = false) + : type(type), ne(ne), eps(eps), broadcast(broadcast) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + std::array broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4}; + + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? 
broadcast_dims.data() : ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + + ggml_set_param(a); + ggml_set_name(a, "a"); + ggml_set_param(b); + ggml_set_name(b, "b"); + + // ADD operation followed by RMS_NORM + ggml_tensor * add_result = ggml_add(ctx, a, b); + ggml_set_name(add_result, "add_result"); + + ggml_tensor * out = ggml_rms_norm(ctx, add_result, eps); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_SSM_CONV +struct test_ssm_conv : public test_case { + const ggml_type type; + const std::array ne_a; + const std::array ne_b; + + std::string vars() override { + return VARS_TO_STR3(type, ne_a, ne_b); + } + + test_ssm_conv(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {10, 10, 10, 1}, + std::array ne_b = {3, 3, 1, 1}) + : type(type), ne_a(ne_a), ne_b(ne_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_tensor * out = ggml_ssm_conv(ctx, a, b); + return out; + } +}; + +// GGML_OP_SSM_SCAN +struct test_ssm_scan : public test_case { + const ggml_type type; + + const int64_t d_state; + const int64_t head_dim; + const int64_t n_head; + const int64_t n_group; + const int64_t n_seq_tokens; + const int64_t n_seqs; + + std::string vars() override { + return VARS_TO_STR7(type, d_state, head_dim, n_head, n_group, n_seq_tokens, n_seqs); + } + + test_ssm_scan(ggml_type type = GGML_TYPE_F32, + int64_t d_state = 32, + int64_t head_dim = 1, // non-zero for Mamba-2 + int64_t n_head = 32, + int64_t n_group = 1, + int64_t n_seq_tokens = 32, + int64_t n_seqs = 32) + : type(type), d_state(d_state), head_dim(head_dim), n_head(n_head), n_group(n_group), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * s = ggml_new_tensor_4d(ctx, type, d_state, head_dim, n_head, n_seqs); + ggml_tensor * x = ggml_new_tensor_4d(ctx, type, head_dim, n_head, n_seq_tokens, n_seqs); + ggml_tensor * dt = ggml_new_tensor_3d(ctx, type, n_head, n_seq_tokens, n_seqs); + ggml_tensor * A = ggml_new_tensor_2d(ctx, type, (head_dim > 1) ? 
1 : d_state, n_head); + ggml_tensor * B = ggml_new_tensor_4d(ctx, type, d_state, n_group, n_seq_tokens, n_seqs); + ggml_tensor * C = ggml_new_tensor_4d(ctx, type, d_state, n_group, n_seq_tokens, n_seqs); + ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs); + ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids); + return out; + } + + // similar to test_mul_mat_id + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } + } +}; + +// GGML_OP_RWKV_WKV6 +struct test_rwkv_wkv6 : public test_case { + const ggml_type type; + + const int64_t head_count; + const int64_t head_size; + const int64_t n_seq_tokens; + const int64_t n_seqs; + + std::string vars() override { + return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); + } + + test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32, + int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) + : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; + ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector{ head_size, head_count }.data()); + ggml_tensor * td = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector{ head_size * head_size * head_count, n_seqs }.data()); + ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s); + return out; + } +}; + +// GGML_OP_GATED_LINEAR_ATTN +struct test_gla : public test_case { + const ggml_type type; + + const int64_t head_count; + const int64_t head_size; + const int64_t n_seq_tokens; + const int64_t n_seqs; + + std::string vars() override { + return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs); + } + + test_gla(ggml_type type = GGML_TYPE_F32, + int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32) + : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t n_tokens = n_seq_tokens * n_seqs; + ggml_tensor * q = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * g = ggml_new_tensor(ctx, type, 3, std::vector{ head_size, head_count, n_tokens }.data()); + ggml_tensor * 
s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_gated_linear_attn(ctx, k, v, q, g, s, pow(head_size, -0.5));
+        return out;
+    }
+};
+
+// GGML_OP_RWKV_WKV7
+struct test_rwkv_wkv7 : public test_case {
+    const ggml_type type;
+
+    const int64_t head_count;
+    const int64_t head_size;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
+    }
+
+    test_rwkv_wkv7(ggml_type type = GGML_TYPE_F32,
+            int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_tokens = n_seq_tokens * n_seqs;
+        ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * w = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        // outputs may become NaN with long seqlen without these normalizations
+        a = ggml_l2_norm(ctx, a, 1e-7F);
+        b = ggml_l2_norm(ctx, b, 1e-7F);
+        ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s);
+        return out;
+    }
+};
+
+// GGML_OP_MUL_MAT
+struct test_mul_mat : public test_case {
+    const ggml_type type_a;
+    const ggml_type type_b;
+    const int64_t m;
+    const int64_t n;
+    const int64_t k;
+    const std::array<int64_t, 2> bs;  // dims 3 and 4
+    const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
+    const std::array<int64_t, 4> per; // permutation of dimensions
+    const int64_t k_v; // size of k in memory, resulting in a non-contiguous view for k_v > k, no view for k_v == 0
+    const uint32_t o;  // number of outputs
+
+    std::string vars() override {
+        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, k_v, o);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    double max_nmse_err(ggml_backend_t backend) override {
+        // for Blackwell we quantize activations to mxfp4 instead of q8_1, so we allow a higher tolerance
+        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+            return 2e-2;
+        }
+        return max_nmse_err();
+    }
+
+    int64_t grad_nmax() override {
+        return 20000;
+    }
+
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
+    }
+
+    test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32,
+            std::array<int64_t, 2> bs = {10, 10},
+            std::array<int64_t, 2> nr = {2, 2},
+            std::array<int64_t, 4> per = {0, 1, 2, 3},
+            int64_t k_v = 0, uint32_t o = 1)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), k_v(k_v), o(o) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
+        ggml_tensor * a;
+        ggml_tensor * b;
+
+        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 
2) + (per[3] != 3); + if (npermuted > 0) { + GGML_ASSERT(npermuted == 2); + GGML_ASSERT(k_v == 0); // not handled + GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0); + GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0); + + // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k. + const int64_t ne_a[4] = {k, m, bs[0], bs[1]}; + const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]}; + + a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]); + b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]); + if (!ggml_is_quantized(type_a)) { + if (bs[1] == 1 && nr[1] == 1) { + ggml_set_param(a); + } + ggml_set_param(b); + } + ggml_set_name(a, "a"); + ggml_set_name(b, "b"); + + a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]); + b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]); + ggml_set_name(a, "a_permuted"); + ggml_set_name(b, "b_permuted"); + } else { + const int64_t k_physical = k_v == 0 ? k : k_v; + a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]); + + if (!ggml_is_quantized(type_a)) { + if (bs[1] == 1 && nr[1] == 1) { + ggml_set_param(a); + } + ggml_set_param(b); + } + + if (k_v != 0) { + GGML_ASSERT(k_v > k); + a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); + b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0); + } + ggml_set_name(a, "a"); + ggml_set_name(b, "b"); + } + + ggml_tensor * out = ggml_mul_mat(ctx, a, b); + ggml_set_name(out, "out"); + for (uint32_t i = 1; i < o; ++i) { + ggml_tensor * out2 = ggml_mul_mat(ctx, a, b); + ggml_set_name(out2, "out2"); + out = ggml_add(ctx, out, out2); + } + + return out; + } + + bool run_whole_graph() override { return o > 1; } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return ggml_op_name(GGML_OP_MUL_MAT); + } +}; + +static void init_mul_mat_id_tensors(ggml_context * ctx, int n_mats) { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i % n_mats; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } +} + +// GGML_OP_MUL_MAT_ID +struct test_mul_mat_id : public test_case { + const ggml_type type_a; + const ggml_type type_b; + const int n_mats; + const int n_used; + const bool b; // broadcast b matrix + const int64_t m; + const int64_t n; + const int64_t k; + + std::string vars() override { + return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k); + } + + double max_nmse_err() override { + return 5e-4; + } + + double max_nmse_err(ggml_backend_t backend) override { + // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance + if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) { + return 2e-2; + } + return max_nmse_err(); + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + return 2 * m * k * n * n_used; + } + + test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, 
ggml_type type_b = GGML_TYPE_F32, + int n_mats = 8, int n_used = 2, bool b = false, + int64_t m = 32, int64_t n = 32, int64_t k = 32) + : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b), + m(m), n(n), k(k) { + GGML_ASSERT(n_used <= n_mats); + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // C^T = A * B^T: (k, m) * (k, n) => (m, n) + ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + ggml_set_name(as, "as"); + + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); + ggml_set_name(ids, "ids"); + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); + } + + ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + init_mul_mat_id_tensors(ctx, n_mats); + } +}; + +// GGML_OP_MUL_MAT_ID + GGML_OP_ADD or GGML_OP_MUL +struct test_mul_mat_id_fusion : public test_case { + const ggml_type type_a; + const ggml_type type_b; + const int n_mats; + const int n_used; + const bool b; // broadcast b matrix + const int64_t m; + const int64_t n; + const int64_t k; + const uint32_t o; // number of outputs + const bool mul; + + std::string vars() override { + return VARS_TO_STR10(type_a, type_b, n_mats, n_used, b, m, n, k, o, mul); + } + + double max_nmse_err() override { + return 5e-4; + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + return 2 * m * k * n * n_used; + } + + test_mul_mat_id_fusion(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, + int n_mats = 8, int n_used = 2, bool b = false, + int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1, bool mul = false) + : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b), + m(m), n(n), k(k), o(o), mul(mul) { + GGML_ASSERT(n_used <= n_mats); + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + // C^T = A * B^T: (k, m) * (k, n) => (m, n) + ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + ggml_set_name(as, "as"); + + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); + ggml_set_name(ids, "ids"); + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); + } + + ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 
1 : n_used, n); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); + ggml_set_name(out, "out"); + + for (uint32_t i = 1; i < o; ++i) { + ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids); + ggml_set_name(out2, "out2"); + out = ggml_add(ctx, out, out2); + } + + if (mul) { + std::array ne { 1, out->ne[1], out->ne[2], out->ne[3] }; + ne[0] = 1; + ggml_tensor * m = ggml_new_tensor(ctx, out->type, 4, ne.data()); + out = ggml_mul(ctx, out, m); + } + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + init_mul_mat_id_tensors(ctx, n_mats); + } + + bool run_whole_graph() override { return true; } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "MUL_MAT_ID_FUSION"; + } +}; + +// GGML_OP_OUT_PROD +struct test_out_prod : public test_case { + const ggml_type type_a; + const ggml_type type_b; + const int64_t m; + const int64_t n; + const int64_t k; + const std::array bs; // dims 3 and 4 + const std::array nr; // repeat in dims 3 and 4 + const bool trans_b; + + std::string vars() override { + return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, trans_b); + } + + double max_nmse_err() override { + return 5e-4; + } + + test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, + int64_t m = 32, int64_t n = 32, int64_t k = 32, + std::array bs = {10, 10}, + std::array nr = {2, 2}, + bool trans_b = false) + : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), trans_b(trans_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]); + ggml_set_name(a, "a"); + + ggml_tensor * b; + if (trans_b) { + b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); + b = ggml_transpose(ctx, b); + } else { + b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0]*nr[0], bs[1]*nr[1]); + } + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_out_prod(ctx, a, b); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SQR +struct test_sqr : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_sqr(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_sqr(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum. 
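+        // e.g. for the default ne = {10, 5, 4, 3}: 0.1f * 0.25f * 600 = 15.0f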
+ } +}; + +// GGML_OP_SQRT +struct test_sqrt : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_sqrt(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 3, 3, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_sqrt(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + // fill with positive values + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, 50.0f, 100.0f); + } + } + + float grad_eps() override { + return 20.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_LOG +struct test_log : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_log(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_log(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass: + init_tensor_uniform(t, 0.9f, 1.1f); + } + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_SIN +struct test_sin : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_sin(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_sin(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. + } + } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_COS +struct test_cos : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_cos(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_cos(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. 
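+            // 2*pi ~= 6.2832, so [-6.5f, 6.5f] spans slightly more than one full period in each direction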
+ } + } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_CLAMP +struct test_clamp : public test_case { + const ggml_type type; + const std::array ne; + float min; + float max; + + std::string vars() override { + return VARS_TO_STR4(type, ne, min, max); + } + + test_clamp(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + float min = -0.5f, float max = 0.5f) + : type(type), ne(ne), min(min), max(max) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_clamp(ctx, a, min, max); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 1e-2f; + } + + std::vector grad_expect() override { + return {0.0f, 1.0f}; + } +}; + +// GGML_OP_FLOOR +struct test_floor : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_floor(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_floor(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.0f, 10.0f); + } + } +}; + +// GGML_OP_CEIL +struct test_ceil : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_ceil(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_ceil(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.0f, 10.0f); + } + } +}; + +// GGML_OP_ROUND +struct test_round : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_round(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_round(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.0f, 10.0f); + } + } +}; + +// GGML_OP_TRUNC +struct test_trunc : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_trunc(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor 
* a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_trunc(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.0f, 10.0f); + } + } +}; + +// GGML_OP_DIAG_MASK_INF +struct test_diag_mask_inf : public test_case { + const ggml_type type; + const std::array ne; + const int n_past; + + std::string vars() override { + return VARS_TO_STR3(type, ne, n_past); + } + + test_diag_mask_inf(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 10, 3, 2}, + int n_past = 5) + : type(type), ne(ne), n_past(n_past) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SOFT_MAX +struct test_soft_max : public test_case { + const ggml_type type; + const std::array ne; + const bool mask; + const bool sinks; + const ggml_type m_prec; + const std::array nr23; // broadcast only dims 2 and 3 + const float scale; + const float max_bias; + const bool inplace; + + std::string vars() override { + return VARS_TO_STR9(type, ne, mask, sinks, m_prec, nr23, scale, max_bias, inplace); + } + + // the 1024 test with bias occasionally fails: + // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL + virtual double max_nmse_err() override { + return 1e-6; + } + + test_soft_max(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + bool mask = false, + bool sinks = false, + ggml_type m_prec = GGML_TYPE_F32, + std::array nr23 = {1, 1}, + float scale = 1.0f, + float max_bias = 0.0f, + bool inplace = false) + : type(type), ne(ne), mask(mask), sinks(sinks), m_prec(m_prec), nr23(nr23), scale(scale), max_bias(max_bias), inplace(inplace) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2]*nr23[0], ne[3]*nr23[1]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * mask = nullptr; + if (this->mask) { + mask = ggml_new_tensor_4d(ctx, m_prec, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(mask, "mask"); + } + + ggml_tensor * sinks = nullptr; + if (this->sinks) { + sinks = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[2]*nr23[0]); + ggml_set_name(sinks, "sinks"); + } + + ggml_tensor * out; + if (inplace) { + out = ggml_soft_max_ext_inplace(ctx, a, mask, scale, max_bias); + } else { + out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias); + } + ggml_soft_max_add_sinks(out, sinks); + ggml_set_name(out, "out"); + + return out; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_SOFT_MAX_BACK +struct test_soft_max_back : public test_case { + const ggml_type type; + const std::array ne; + const float scale; + const float max_bias; + + std::string vars() override { + return VARS_TO_STR4(type, ne, scale, max_bias); + } + + test_soft_max_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + float scale = 1.0f, + float max_bias = 0.0f) + : type(type), ne(ne), scale(scale), max_bias(max_bias) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, 
"a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_soft_max_ext_back(ctx, a, b, scale, max_bias); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ROPE + GGML_OP_ROPE_BACK +struct test_rope : public test_case { + const ggml_type type; + const std::array ne_a; + int n_dims; + int mode; + int n_ctx; // used to generate positions + float fs; // freq_scale + float ef; // ext_factor + float af; // attn_factor + bool ff; + int v; // view (1 : non-contiguous a) + bool forward; + bool inplace; + + std::string vars() override { + // forward can be inferred from the op, does not need to be printed + return VARS_TO_STR11(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v, inplace); + } + + test_rope(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {10, 5, 3, 1}, + int n_dims = 10, int mode = GGML_ROPE_TYPE_NORMAL, int n_ctx = 512, float fs = 1.0f, + float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true, bool inplace = false) + : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward), inplace(inplace) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a; + if (v & 1) { + auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (forward) { + ggml_set_param(a); + } + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + if (forward) { + ggml_set_param(a); + } + ggml_set_name(a, "a"); + } + + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + ggml_tensor * pos; + if (is_mrope || is_vision) { + pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4); + } else { + pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]); + } + ggml_set_name(pos, "pos"); + + ggml_tensor * freq = nullptr; + if (ff) { + freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2); + ggml_set_name(freq, "freq"); + } + + ggml_tensor * out; + if (is_mrope) { + if (is_vision) { + GGML_ASSERT(n_dims/4 > 0); + int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate + if (forward) { + if (inplace) { + out = ggml_rope_multi_inplace(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } + } else { + out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } + } else { + GGML_ASSERT(n_dims/3 > 0); + int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0}; + if (forward) { + if (inplace) { + out = ggml_rope_multi_inplace(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } + } else { + out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } + } + } else { + if (forward) { + if (inplace) { + out = ggml_rope_ext_inplace(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + } else { + out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, 
fs, ef, af, 1.0f, 1.0f);
+                }
+            } else {
+                out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+            }
+
+            // TODO: add test with a non-contiguous view as input; this case is needed for build_rope_2d in clip.cpp
+        }
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                // pos
+                const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+                std::vector<int> data(num_pos_ids);
+                for (int i = 0; i < num_pos_ids; i++) {
+                    data[i] = rand() % n_ctx;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
+            } else {
+                if (t->ne[0] == n_dims/2) {
+                    // frequency factors in the range [0.9f, 1.1f]
+                    init_tensor_uniform(t, 0.9f, 1.1f);
+                } else {
+                    init_tensor_uniform(t);
+                }
+            }
+        }
+    }
+
+    double max_maa_err() override {
+        return 1e-3;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
+// GGML_OP_POOL2D
+struct test_pool2d : public test_case {
+    enum ggml_op_pool pool_type;
+    const ggml_type type_input;
+    const std::array<int64_t, 4> ne_input;
+    // kernel size
+    const int k0;
+    const int k1;
+    // stride
+    const int s0;
+    const int s1;
+    // padding
+    const int p0;
+    const int p1;
+
+    std::string vars() override {
+        return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
+    }
+
+    test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
+            ggml_type type_input = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
+            int k0 = 3, int k1 = 3,
+            int s0 = 1, int s1 = 1,
+            int p0 = 1, int p1 = 1)
+        : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(input);
+        ggml_set_name(input, "input");
+
+        ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_POOL1D
+struct test_pool1d : public test_case {
+    enum ggml_op_pool pool_type;
+    const ggml_type type_input;
+    const std::array<int64_t, 4> ne_input;
+    const int k0;
+    const int s0;
+    const int p0;
+
+    std::string vars() override {
+        return VARS_TO_STR6(pool_type, type_input, ne_input, k0, s0, p0);
+    }
+
+    test_pool1d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
+            ggml_type type_input = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_input = {10, 1, 1, 1},
+            int k0 = 3, int s0 = 3, int p0 = 0)
+        : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), s0(s0), p0(p0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(input);
+        ggml_set_name(input, "input");
+
+        ggml_tensor * out = ggml_pool_1d(ctx, input, pool_type, k0, s0, p0);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_CONV_TRANSPOSE_1D
+struct test_conv_transpose_1d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+
+    const int s0; // stride
+    const int p0; // padding
+    const int d0; // dilation
+
+    std::string vars() override {
+        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
+    }
+
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1},  // [input_width, input_channels, 1 /* assert in cpu kernel*/, 1 (should be batch)]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, output_channels, input_channels, 1 (should be batch)]
+                           int s0 = 1, int p0 = 0, int d0 = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_set_name(input, "input");
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_CONV_TRANSPOSE_2D
+struct test_conv_transpose_2d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+    const int stride;
+
+    std::string vars() override {
+        return VARS_TO_STR3(ne_input, ne_kernel, stride);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4; // The default 1e-7 is too small for Vulkan.
+    }
+
+    test_conv_transpose_2d(std::array<int64_t, 4> ne_input = {10, 10, 3, 1},  // [input_width, input_height, input_channels, 1]
+                           std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1},   // [kernel_width, kernel_height, input_channels, 1]
+                           int stride = 1)
+        : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_set_name(input, "input");
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        ggml_tensor * out = ggml_conv_transpose_2d_p0(ctx, kernel, input, stride);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_IM2COL
+struct test_im2col : public test_case {
+    const ggml_type type_input;
+    const ggml_type type_kernel;
+    const ggml_type dst_type;
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+    // stride
+    const int s0;
+    const int s1;
+    // padding
+    const int p0;
+    const int p1;
+    // dilation
+    const int d0;
+    const int d1;
+    // mode
+    const bool is_2D;
+
+    std::string vars() override {
+        return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
+    }
+
+    test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_input = {10, 10, 3, 1},  // [input_width, input_height, input_channels, 1]
+            std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1},   // [kernel_width, kernel_height, input_channels, 1]
+            int s0 = 1, int s1 = 1,
+            int p0 = 1, int p1 = 1,
+            int d0 = 1, int d1 = 1,
+            bool is_2D = true)
+        : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(input);
+        ggml_set_name(input, "input");
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_IM2COL_3D
+struct test_im2col_3d : public test_case {
+    const ggml_type type_input;
+    const ggml_type type_kernel;
+    const ggml_type dst_type;
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+    // stride
+    const int s0;
+    const int s1;
+    const int s2;
+    // padding
+    const int p0;
+    const int p1;
+    const int p2;
+    // dilation
+    const int d0;
+    const int d1;
+    const int d2;
+
+    const int64_t IC;
+    const bool v;
+
+    std::string vars() override {
+        return VARS_TO_STR16(type_input, type_kernel, dst_type, ne_input, ne_kernel, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, v);
+    }
+
+    test_im2col_3d(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_input = {10, 10, 10, 9}, // [N*IC, ID, IH, IW]
+            std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1},   // [OC*IC, KD, KH, KW]
+            int64_t IC = 3,
+            int s0 = 1, int s1 = 1, int s2 = 1,
+            int p0 = 1, int p1 = 1, int p2 = 1,
+            int d0 = 1, int d1 = 1, int d2 = 1,
+            bool v = false)
+        : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), s2(s2), p0(p0), p1(p1), p2(p2), d0(d0), d1(d1), d2(d2), IC(IC), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+        ggml_set_param(input);
+        ggml_set_name(input, "input");
+
+        if (v) {
+            input = ggml_view_4d(ctx, input, ne_input[0] - 2, ne_input[1] - 2, ne_input[2] - 2, ne_input[3] - 2, input->nb[1], input->nb[2], input->nb[3], 0);
+            ggml_set_name(input, "view_of_input");
+        }
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        ggml_tensor * out = ggml_im2col_3d(ctx, kernel, input, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, dst_type);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// CONV_2D
+struct test_conv_2d : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+    const ggml_type type_kernel;
+    const int stride0;
+    const int stride1;
+    const int padding0;
+    const int padding1;
+    const int dilation0;
+    const int dilation1;
+    // Whether the inputs are contiguous in the channel dim or the width dim
+    const bool cwhn;
+
+    // If true, the direct CONV_2D will be used in the graph, otherwise it
+    // uses ggml_conv_2d:
+    // * if the program is called with -o CONV_2D_DIRECT_IMPL, the
+    //   CONV_2D graph will be built, while
+    // * if the program is called with -o CONV_2D_INDIRECT_IMPL, the
+    //   IM2COL -> MUL_MM graph will be built.
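+    //
+    // Illustrative shapes (a hypothetical example, not the test defaults):
+    // with H = W = 10, Cin = 3, KH = KW = 3, stride 1, padding 1, dilation 1,
+    //   OH = OW = (10 + 2*1 - 1*(3-1) - 1)/1 + 1 = 10,
+    // so the indirect path multiplies a [Cout, Cin*KH*KW] = [Cout, 27] kernel
+    // matrix with a [27, N*OH*OW] = [27, 100] im2col matrix, while the direct
+    // path computes the same values without materializing the im2col buffer.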
+ + std::string vars() override { + return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn); + } + + double max_nmse_err() override { + return 5e-4; + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + // Just counting matmul costs: + // KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops + + // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + + int64_t W = ne_input[0]; + int64_t H = ne_input[1]; + int64_t KW = ne_kernel[0]; + int64_t KH = ne_kernel[1]; + int64_t Cin = ne_kernel[2]; + int64_t Cout = ne_kernel[3]; + int64_t N = ne_input[3]; + int64_t OH = calc_conv_output_size(H, KH, stride0, padding0, dilation0); + int64_t OW = calc_conv_output_size(W, KW, stride0, padding0, dilation0); + + int64_t K = Cout; + int64_t CRS = Cin * KH * KW; + int64_t NPQ = N * OH * OW; + + return K * NPQ * (2 * CRS - 1); + } + + test_conv_2d(std::array ne_input = { 64, 64, 16, 1 }, + std::array ne_kernel = { 3, 3, 1, 16 }, ggml_type type_kernel = GGML_TYPE_F32, int stride0 = 1, + int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) : + ne_input(ne_input), + ne_kernel(ne_kernel), + type_kernel(type_kernel), + stride0(stride0), + stride1(stride1), + padding0(padding0), + padding1(padding1), + dilation0(dilation0), + dilation1(dilation1), + cwhn(cwhn) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + + if (cwhn) { + // change memory layout to channel-most-contiguous (CWHN), + // then permute it back so NE matches the original input + input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3)); + input = ggml_permute(ctx, input, 2, 0, 1, 3); + kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0)); + kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); + } + + ggml_tensor * out = + ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1); + ggml_set_name(out, "out"); + return out; + } +}; + +// GGML_OP_CONV_2D_DW +struct test_conv_2d_dw : public test_case { + const std::array ne_input; + const std::array ne_kernel; + const int stride; + const int padding; + const int dilation; + const bool cwhn; + + std::string vars() override { + return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn); + } + + test_conv_2d_dw(std::array ne_input = {64, 64, 16, 1}, + std::array ne_kernel = {3, 3, 1, 16}, + int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false) + : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + + ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + + if (cwhn) { + // change memory layout to channel-most-contiguous (CWHN), + // then permute it back so NE matches the original input + input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3)); + input = 
ggml_permute(ctx, input, 2, 0, 1, 3); + kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0)); + kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1); + } + + ggml_tensor * out = ggml_conv_2d_dw_direct( + ctx, kernel, input, + stride, stride, padding, padding, dilation, dilation); + ggml_set_name(out, "out"); + return out; + } +}; + +// GGML_OP_CONV_3D +struct test_conv_3d : public test_case { + // Logical 5D dimensions + const int64_t N, IC, ID, IH, IW; + const int64_t OC, KD, KH, KW; + // Conv params + const int s0, s1, s2; + const int p0, p1, p2; + const int d0, d1, d2; + // Types + const ggml_type type_kernel; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "CONV_3D"; + } + + std::string vars() override { + return VARS_TO_STR11(N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1) + "," + + VARS_TO_STR8(s2, p0, p1, p2, d0, d1, d2, type_kernel); + } + + double max_nmse_err() override { + return 5e-4; + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + const int64_t OD = calc_conv_output_size(ID, KD, s2, p2, d2); + const int64_t OH = calc_conv_output_size(IH, KH, s1, p1, d1); + const int64_t OW = calc_conv_output_size(IW, KW, s0, p0, d0); + + return (uint64_t)N * OC * OD * OH * OW * (2 * IC * KD * KH * KW - 1); + } + + test_conv_3d( + int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, + int64_t OC, int64_t KD, int64_t KH, int64_t KW, + int s0, int s1, int s2, + int p0, int p1, int p2, + int d0, int d1, int d2, + ggml_type type_kernel + ) : N(N), IC(IC), ID(ID), IH(IH), IW(IW), + OC(OC), KD(KD), KH(KH), KW(KW), + s0(s0), s1(s1), s2(s2), + p0(p0), p1(p1), p2(p2), + d0(d0), d1(d1), d2(d2), + type_kernel(type_kernel) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + // GGML input tensor is packed as [W, H, D, C*N] + const int64_t ne_input[] = {IW, IH, ID, IC * N}; + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input); + ggml_set_name(input, "input"); + + // GGML kernel tensor is packed as [KW, KH, KD, IC*OC] + const int64_t ne_kernel[] = {KW, KH, KD, IC * OC}; + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * out = ggml_conv_3d_direct(ctx, kernel, input, s0, s1, s2, p0, p1, p2, d0, d1, d2, (int)IC, (int)N, (int)OC); + ggml_set_name(out, "out"); + return out; + } +}; + +// GGML_OP_CONCAT +struct test_concat : public test_case { + const ggml_type type; + const std::array ne_a; + const int64_t ne_b_d; + const int dim; + const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b) + + std::string vars() override { + return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v); + } + + test_concat(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {10, 5, 5, 5}, + int64_t ne_b_d = 5, + int dim = 2, int v = 0) + : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + auto ne_b = ne_a; + ne_b[dim] = ne_b_d; + ggml_tensor * a; + if (v & 1) { + auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); + } else { + a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + } + ggml_tensor * b; + if (v & 2) { + auto ne = ne_b; 
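+            // as with 'a' above: allocate an inflated backing tensor first,
+            // then take an ne_b-shaped view of it so that 'b' is non-contiguous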
ne[0] *= 3; ne[1] *= 2; ne[2] *= 4; + b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0); + ggml_set_name(b, "view_of_b"); + } else { + b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_set_name(b, "b"); + } + + ggml_tensor * out = ggml_concat(ctx, a, b, dim); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ARGSORT +struct test_argsort : public test_case { + const ggml_type type; + const std::array ne; + ggml_sort_order order; + + std::string vars() override { + return VARS_TO_STR3(type, ne, order); + } + + test_argsort(ggml_type type = GGML_TYPE_F32, + std::array ne = {16, 10, 10, 10}, + ggml_sort_order order = GGML_SORT_ORDER_ASC) + : type(type), ne(ne), order(order) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_argsort(ctx, a, order); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + // indices + std::vector data(ggml_nelements(t)); + for (int i = 0; i < ggml_nelements(t); i++) { + data[i] = rand(); + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int)); + } else if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + GGML_ABORT("fatal error"); + } + } + } +}; + +// GGML_OP_TOP_K +struct test_top_k : public test_case { + const ggml_type type; + const std::array ne; + const int k; + const bool ties; + ggml_tensor * input {}; + + std::string vars() override { + return VARS_TO_STR4(type, ne, k, ties); + } + + test_top_k(ggml_type type = GGML_TYPE_F32, + std::array ne = {16, 10, 10, 10}, + int k = 4, bool ties = false) + : type(type), ne(ne), k(k), ties(ties) {} + + double max_err() override { + return 0.0; + } + + // When there are ties, only validate the final result. + // The logic in err can't handle the sentinel tensors. + bool run_whole_graph() override { return ties; } + + double err(const float * a, const float * b, size_t n) override { + // When there are no ties, we expect the exact same set of indices, + // but possibly in a different order. When there are ties, the indices + // can be different but the input values they correspond to should be + // the same. The logic for ties could work for non-ties, but only for + // the output tensor, not for the sentinel tensors. 
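+    // Illustrative example (hypothetical, not taken from the test data): for a
+    // row [3, 1, 3, 2] with k = 2, both index sets {0, 2} and {2, 0} are valid
+    // top-k results; the values they select (3 and 3) are identical, so with
+    // ties the comparison is done on the selected values rather than on indices.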
+        if (ties) {
+            std::vector<float> src(ggml_nelements(input));
+
+            ggml_backend_tensor_get(input, src.data(), 0, ggml_nelements(input) * ggml_type_size(type));
+
+            double diff = 0.0f;
+
+            GGML_ASSERT(n == (size_t)(ggml_nrows(input) * k));
+            int64_t cols = input->ne[0];
+            std::vector<int32_t> ia(k);
+            std::vector<int32_t> ib(k);
+            std::vector<float> asrc(k);
+            std::vector<float> bsrc(k);
+            for (int64_t r = 0; r < ggml_nrows(input); r++) {
+                // Convert indices for the row back to integer
+                for (int64_t c = 0; c < k; c++) {
+                    ia[c] = (int32_t)a[r * k + c];
+                    ib[c] = (int32_t)b[r * k + c];
+                }
+                // The src values for each row should match.
+                for (int64_t c = 0; c < k; c++) {
+                    asrc[c] = src[r * cols + ia[c]];
+                    bsrc[c] = src[r * cols + ib[c]];
+                }
+                diff += jdst(asrc.data(), bsrc.data(), k);
+                // There should be no duplicate indices
+                std::sort(ia.begin(), ia.end());
+                std::sort(ib.begin(), ib.end());
+                if (std::adjacent_find(ia.begin(), ia.end()) != ia.end()) {
+                    diff += 1;
+                }
+                if (std::adjacent_find(ib.begin(), ib.end()) != ib.end()) {
+                    diff += 1;
+                }
+            }
+            return diff;
+        } else {
+            std::vector<int32_t> ia(n);
+            std::vector<int32_t> ib(n);
+
+            double diff = 0.0f;
+
+            for (size_t i = 0; i < n; i++) {
+                ia[i] = (int32_t) a[i];
+                ib[i] = (int32_t) b[i];
+
+                // penalize the result if the data is not integer valued
+                diff += std::fabs(a[i] - ia[i]);
+                diff += std::fabs(b[i] - ib[i]);
+            }
+
+            return diff + jdst(ia.data(), ib.data(), n);
+        }
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        // Save 'a' for err()
+        input = a;
+
+        ggml_tensor * out = ggml_top_k(ctx, a, k);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        std::random_device rd;
+        std::default_random_engine rng(rd());
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            int tie_denom = std::max(1, std::min(10, k / 2));
+            for (int64_t r = 0; r < ggml_nrows(t); r++) {
+                std::vector<float> data(t->ne[0]);
+                for (int i = 0; i < t->ne[0]; i++) {
+                    if (ties) {
+                        // integer division to introduce duplicates
+                        data[i] = i / tie_denom;
+                    } else {
+                        data[i] = i;
+                    }
+                }
+                std::shuffle(data.begin(), data.end(), rng);
+                ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
+            }
+        }
+    }
+};
+
+enum MoeGatingFunc {
+    GATING_FUNC_SOFTMAX,
+    GATING_FUNC_SIGMOID,
+    GATING_FUNC_SOFTMAX_WEIGHT,
+};
+
+struct test_topk_moe : public test_case {
+    const std::array<int64_t, 4> ne;
+    const int n_expert_used;
+    const bool with_norm;
+    const bool bias_probs;
+    const MoeGatingFunc gating_func;
+    const float scale_w;
+    ggml_tensor * weights {};
+    ggml_tensor * selected_experts {};
+
+    test_topk_moe(std::array<int64_t, 4> ne = { 10, 5, 1, 1 },
+                  int n_expert_used = 1,
+                  bool with_norm = false,
+                  bool bias_probs = false,
+                  MoeGatingFunc gating_func = GATING_FUNC_SOFTMAX,
+                  float scale_w = 0.0f) :
+        ne(ne),
+        n_expert_used(n_expert_used),
+        with_norm(with_norm),
+        bias_probs(bias_probs),
+        gating_func(gating_func),
+        scale_w(scale_w) {
+        GGML_ASSERT(n_expert_used <= ne[0]);
+    }
+
+    std::string vars() override { return VARS_TO_STR6(ne, n_expert_used, with_norm, bias_probs, gating_func, scale_w); }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "TOPK_MOE";
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int n_expert = ne[0];
+        const int n_tokens = ne[1];
+
+        ggml_tensor * logits = ggml_new_tensor(ctx,
GGML_TYPE_F32, 4, ne.data()); + ggml_tensor * probs = + (gating_func == GATING_FUNC_SOFTMAX) ? ggml_soft_max(ctx, logits) : + (gating_func == GATING_FUNC_SIGMOID) ? ggml_sigmoid(ctx, logits) : logits; + ggml_set_name(probs, "probs"); + + ggml_tensor * selection_probs = probs; + if (bias_probs) { + ggml_tensor * exp_probs_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]); + ggml_set_name(exp_probs_b, "exp_probs_b"); + selection_probs = ggml_add(ctx, probs, exp_probs_b); + ggml_set_name(selection_probs, "selection_probs"); + } + + selected_experts = ggml_argsort_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + ggml_set_name(selected_experts, "selected_experts"); + + weights = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + ggml_set_name(weights, "weights"); + + if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) { + weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); + weights = ggml_soft_max(ctx, weights); // [n_expert_used, n_tokens] + weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); + } + + if (with_norm) { + weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); + ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] + ggml_set_name(weights_sum, "weights_sum"); + + weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5, INFINITY); + weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] + weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); + } + + if (scale_w) { + weights = ggml_scale(ctx, weights, scale_w); + } + + ggml_set_name(weights, "weights"); + return weights; + } + // Verify two outputs + std::vector fusion_test_nodes() override { return { selected_experts, weights }; } + + // allow output in arbitrary order + double err(const float * a, const float * b, size_t n) override { + std::vector a2(n); + std::vector b2(n); + for (size_t i = 0; i < n; ++i) { + a2[i] = a[i]; + b2[i] = b[i]; + } + std::sort(a2.begin(), a2.end()); + std::sort(b2.begin(), b2.end()); + return nmse(a2.data(), b2.data(), n); + } +}; + +struct test_mul_mat_vec_fusion : public test_case { + const ggml_type type; + const ggml_glu_op glu_op; + const int64_t m; + const int64_t n; + const int64_t k; + const bool use_id; + const int n_mats; + const int n_used; + const bool b; // broadcast b matrix (only for use_id) + const bool with_bias; + const bool with_gate; + std::array batch_dims; + + test_mul_mat_vec_fusion(ggml_type type, ggml_glu_op op, int64_t m, int64_t n, int64_t k, + bool use_id = false, int n_mats = 1, int n_used = 1, bool b = false, bool with_bias = false, bool with_gate = true, + std::array batch_dims = {4, 2}) + : type(type), glu_op(op), m(m), n(n), k(k), use_id(use_id), n_mats(n_mats), n_used(n_used), b(b), with_bias(with_bias), with_gate(with_gate), batch_dims(batch_dims) { + if (use_id) { + GGML_ASSERT(n_used <= n_mats); + } + } + + std::string vars() override { + return VARS_TO_STR12(type, glu_op, m, n, k, use_id, n_mats, n_used, b, with_bias, with_gate, batch_dims); + } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "MUL_MAT_VEC_FUSION"; + } + + bool run_whole_graph() override { return true; } + + ggml_tensor * build_gate(ggml_context * ctx, ggml_tensor * ffn_gate, ggml_tensor * ffn_up) { + ggml_tensor * out = nullptr; + if (with_gate) { + if (glu_op == GGML_GLU_OP_SWIGLU_OAI) { + constexpr float alpha = 1.702f; + constexpr float limit = 7.0f; + out = 
ggml_swiglu_oai(ctx, ffn_gate, ffn_up, alpha, limit); + } else { + out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op); + } + } + return out; + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + if (!use_id) { + const int channels = batch_dims[0]; + const int samples = batch_dims[1]; + std::array ne = { k, m, channels, samples }; + std::array ne0 = { k, n, channels, samples }; + + ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data()); + ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr; + ggml_tensor * up = ggml_new_tensor(ctx, type, 4, ne0.data()); + + ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur); + if (with_bias) { + std::array bias_ne = { ffn_up->ne[0], 1, channels, samples }; + ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data()); + ffn_up = ggml_add(ctx, ffn_up, up_bias); + } + + ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr; + if (with_bias && with_gate) { + std::array bias_ne = { ffn_gate->ne[0], 1, channels, samples }; + ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data()); + ffn_gate = ggml_add(ctx, ffn_gate, gate_bias); + } + + ggml_tensor * out = with_gate ? build_gate(ctx, ffn_gate, ffn_up) : ffn_up; + + std::array bias2_ne = { out->ne[0], 1, channels, samples }; + ggml_tensor * bias2 = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias2_ne.data()); + out = ggml_add(ctx, out, bias2); + + ggml_set_name(out, "out"); + return out; + } else { + ggml_tensor * gates = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_tensor * ups = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, m); + + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, m, ids->nb[1], 0); + } + + ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m); + ggml_set_name(cur, "cur"); + + ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids); + if (with_bias) { + ggml_tensor * up_bias_param = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ffn_up->ne[0], n_mats); + ffn_up = ggml_add_id(ctx, ffn_up, up_bias_param, ids); + } + + ggml_tensor * ffn_gate = with_gate? ggml_mul_mat_id(ctx, gates, cur, ids) : nullptr; + if (with_bias && with_gate) { + ggml_tensor * gate_bias_param = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ffn_gate->ne[0], n_mats); + ffn_gate = ggml_add_id(ctx, ffn_gate, gate_bias_param, ids); + } + + ggml_tensor * out = with_gate ? 
build_gate(ctx, ffn_gate, ffn_up) : ffn_up; + + std::array scale_ne { 1, out->ne[1], out->ne[2], out->ne[3] }; + ggml_tensor * scale = ggml_new_tensor(ctx, out->type, 4, scale_ne.data()); + out = ggml_mul(ctx, out, scale); + + ggml_set_name(out, "out"); + return out; + } + } + + void initialize_tensors(ggml_context * ctx) override { + if (!use_id) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } + } else { + init_mul_mat_id_tensors(ctx, n_mats); + } + } + + double max_nmse_err() override { + return 5e-3; + } +}; + +// GGML_OP_SUM +struct test_sum : public test_case { + const ggml_type type; + const std::array ne; + const std::array permute; + bool _use_permute; + + std::string vars() override { + std::string v = VARS_TO_STR2(type, ne); + if (_use_permute) v += "," + VAR_TO_STR(permute); + return v; + } + + test_sum(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + std::array permute = {0, 0, 0, 0}) + : type(type), ne(ne), permute(permute), + _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + if (_use_permute) { + a = ggml_permute(ctx, a, permute[0], permute[1], permute[2], permute[3]); + ggml_set_name(a, "a_permuted"); + } + + ggml_tensor * out = ggml_sum(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]); + } + + // Don't center the distribution around zero. Helps to avoid catastrophic cancellation. + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -0.9f, 1.1f); + } + } +}; + +// GGML_OP_SUM_ROWS +struct test_sum_rows : public test_case { + const ggml_type type; + const std::array ne; + const bool permute; + const bool slice; + + std::string vars() override { + return VARS_TO_STR4(type, ne, permute, slice); + } + + test_sum_rows(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + bool permute = false, bool slice = false) + : type(type), ne(ne), permute(permute), slice(slice) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + if (slice) { + a = ggml_view_4d(ctx, a, + ne[0], ne[1], ne[2] / 2, ne[3] - 1, + a->nb[1], a->nb[2] * 2, a->nb[3], /*offset=*/a->nb[3]); + } + if (permute) { + a = ggml_permute(ctx, a, 0, 2, 3, 1); + } + + ggml_tensor * out = ggml_sum_rows(ctx, a); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_MEAN +struct test_mean : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_mean(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_mean(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * ne[0]*ne[1]*ne[2]*ne[3]; + } + + // Don't center the distribution around zero. Helps to avoid catastrophic cancellation. 
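+    // For example, with ne = {10, 5, 4, 3} there are 600 elements; drawn from
+    // [-1, 1] their expected sum is near 0 and float rounding noise dominates,
+    // while the [-0.9, 1.1] range used below gives an expected sum of about 60
+    // (0.1 per element), keeping the reference value well away from zero.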
+ void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -0.9f, 1.1f); + } + } +}; + +// GGML_OP_UPSCALE +struct test_upscale : public test_case { + const ggml_type type; + const std::array ne; + const int32_t scale_factor; + const bool transpose; + const ggml_scale_mode mode; + + std::string vars() override { + return VARS_TO_STR5(type, ne, scale_factor, mode, transpose); + } + + test_upscale(ggml_type type = GGML_TYPE_F32, + std::array ne = {512, 512, 3, 1}, + int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false) + : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose), mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + if (transpose) { + a = ggml_transpose(ctx, a); + ggml_set_name(a, "a_transposed"); + } + + ggml_tensor * out = ggml_upscale(ctx, a, scale_factor, mode); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_UPSCALE (via ggml_interpolate) +struct test_interpolate : public test_case { + const ggml_type type; + const std::array ne; + const std::array ne_tgt; + const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST; + + std::string vars() override { + return VARS_TO_STR4(type, ne, ne_tgt, mode); + } + + test_interpolate(ggml_type type = GGML_TYPE_F32, + std::array ne = {2, 5, 7, 11}, + std::array ne_tgt = {5, 7, 11, 13}, + ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST) + : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_interpolate(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_GROUP_NORM +struct test_group_norm : public test_case { + const ggml_type type; + const std::array ne; + const int32_t num_groups; + const float eps; + + std::string vars() override { + return VARS_TO_STR4(type, ne, num_groups, eps); + } + + test_group_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 64, 320, 1}, + int32_t num_groups = 32, + float eps = 1e-6f) + : type(type), ne(ne), num_groups(num_groups), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_GROUP_NORM + GGML_OP_MUL + GGML_OP_ADD +struct test_group_norm_mul_add : public test_case { + const ggml_type type; + const std::array ne; + int num_groups; + float eps; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "GROUP_NORM_MUL_ADD"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR4(type, ne, num_groups, eps); + } + + test_group_norm_mul_add(ggml_type type = GGML_TYPE_F32, + std::array ne = {128, 1, 1, 1}, + int num_groups = 4, + float eps = 1e-5f) + : type(type), ne(ne), num_groups(num_groups), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_tensor * w = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, 
ne.data()); + ggml_set_param(a); ggml_set_param(w); ggml_set_param(b); + ggml_set_name(a, "a"); ggml_set_name(w, "w"); ggml_set_name(b, "b"); + ggml_tensor * n = ggml_group_norm(ctx, a, num_groups, eps); + ggml_tensor * m = ggml_mul(ctx, n, w); + ggml_tensor * out = ggml_add(ctx, m, b); + ggml_set_name(out, "out"); + return out; + } +}; + +// GGML_OP_L2_NORM +struct test_l2_norm : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_l2_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 64, 320, 1}, + float eps = 1e-12f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_l2_norm(ctx, a, eps); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ACC +struct test_acc : public test_case { + const ggml_type type; + const std::array ne_a; + const std::array ne_b; + + std::string vars() override { + return VARS_TO_STR3(type, ne_a, ne_b); + } + + test_acc(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {256, 17, 1, 1}, + std::array ne_b = {256, 16, 1, 1}) + : type(type), ne_a(ne_a), ne_b(ne_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_set_param(b); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_PAD +struct test_pad : public test_case { + const ggml_type type; + const std::array ne_a; + const int pad_0; + const int pad_1; + const bool circular; + + std::string vars() override { + return VARS_TO_STR5(type, ne_a, pad_0, pad_1, circular); + } + + test_pad(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 512, 1, 1}, + int pad_0 = 1, int pad_1 = 1, bool circular = false) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1), circular(circular) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = circular + ? 
ggml_pad_circular(ctx, a, pad_0, pad_1, 0, 0) + : ggml_pad(ctx, a, pad_0, pad_1, 0, 0); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_PAD (with extension) +struct test_pad_ext : public test_case { + const ggml_type type; + const std::array ne_a; + const int lp0; + const int rp0; + const int lp1; + const int rp1; + const int lp2; + const int rp2; + const int lp3; + const int rp3; + const int tfrm; // 0 - none, 1 - non-cont, 2 - perm + const bool circular; + + std::string vars() override { + return VARS_TO_STR12(type, ne_a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, tfrm, circular); + } + + test_pad_ext(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 512, 3, 1}, + int lp0 = 1, int rp0 = 1, int lp1 = 1, int rp1 = 1, + int lp2 = 1, int rp2 = 1, int lp3 = 1, int rp3 = 1, + int tfrm = 0, bool circular = false) + : type(type), ne_a(ne_a), lp0(lp0), rp0(rp0), lp1(lp1), rp1(rp1), lp2(lp2), rp2(rp2), lp3(lp3), rp3(rp3), + tfrm(tfrm), circular(circular) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + + if (tfrm == 1) { + a = ggml_view_4d(ctx, a, (a->ne[0] + 1) / 2, (a->ne[1] + 1) / 2, (a->ne[2] + 1) / 2, (a->ne[3] + 1) / 2, a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view of a"); + } else if (tfrm == 2) { + a = ggml_permute(ctx, a, 2, 1, 0, 3); + ggml_set_name(a, "permuted a"); + } + + ggml_tensor * out = circular + ? ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + : ggml_pad_ext (ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_PAD_REFLECT_1D +struct test_pad_reflect_1d : public test_case { + const ggml_type type; + const std::array ne_a; + const int pad_0; + const int pad_1; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + } + + test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 34, 2, 1}, + int pad_0 = 10, int pad_1 = 9) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ROLL +struct test_roll : public test_case { + const int shift0; + const int shift1; + const int shift3; + const int shift4; + + std::string vars() override { + return VARS_TO_STR4(shift0, shift1, shift3, shift4); + } + + test_roll(int shift0 = 3, int shift1 = -2, int shift3 = 1, int shift4 = -1) + : shift0(shift0), shift1(shift1), shift3(shift3), shift4(shift4) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + int64_t ne[4] = {10, 5, 4, 3}; + ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_roll(ctx, a, shift0, shift1, shift3, shift4); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_ARANGE +struct test_arange : public test_case { + const ggml_type type; + const float start; + const float stop; + const float step; + + std::string vars() override { + return VARS_TO_STR4(type, start, stop, step); + } + + test_arange(ggml_type type = GGML_TYPE_F32, + float start = 0.f, float stop = 10.f, float step = 1.f) + : type(type), start(start), stop(stop), step(step) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * out = 
ggml_arange(ctx, start, stop, step); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_TIMESTEP_EMBEDDING +struct test_timestep_embedding : public test_case { + const ggml_type type; + const std::array ne_a; + const int dim; + const int max_period; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, dim, max_period); + } + + test_timestep_embedding(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {2, 1, 1, 1}, + int dim = 320, int max_period=10000) + : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_LEAKY_RELU +struct test_leaky_relu : public test_case { + const ggml_type type; + const std::array ne_a; + const float negative_slope; + + std::string vars() override { + return VARS_TO_STR3(type, ne_a, negative_slope); + } + + test_leaky_relu(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {10, 5, 4, 3}, + float negative_slope = 0.1f) + : type(type), ne_a(ne_a), negative_slope(negative_slope) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_FLASH_ATTN_EXT +struct test_flash_attn_ext : public test_case { + const int64_t hsk; // K head size + const int64_t hsv; // V head size + const int64_t nh; // num heads + const std::array nr23; // repeat in dim 2 and 3, tests for grouped-query attention + const int64_t kv; // kv size + const int64_t nb; // batch size + + const bool mask; // use mask + const bool sinks; // use sinks + + const float max_bias; // ALiBi + const float logit_softcap; // Gemma 2 + + const ggml_prec prec; + const ggml_type type_KV; + std::array permute; + + std::string vars() override { + return VARS_TO_STR13(hsk, hsv, nh, nr23, kv, nb, mask, sinks, max_bias, logit_softcap, prec, type_KV, permute); + } + + double max_nmse_err() override { + return 5e-4; + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + // Just counting matmul costs: + // Q*K^T is nb x hsk x kv, P*V is nb x kv x hsv, per head + return (2 * nh*nr23[0] * nb * (hsk + hsv) * kv)*nr23[1]; + } + + test_flash_attn_ext(int64_t hsk = 128, int64_t hsv = 128, int64_t nh = 32, std::array nr23 = {1, 1}, int64_t kv = 96, int64_t nb = 8, + bool mask = true, bool sinks = false, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_prec prec = GGML_PREC_F32, + ggml_type type_KV = GGML_TYPE_F16, std::array permute = {0, 1, 2, 3}) + : hsk(hsk), hsv(hsv), nh(nh), nr23(nr23), kv(kv), nb(nb), mask(mask), sinks(sinks), max_bias(max_bias), logit_softcap(logit_softcap), prec(prec), type_KV(type_KV), permute(permute) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV)); + const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV)); + + auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, bool is_view) -> ggml_tensor * { + int64_t ne[4] = {ne0, ne1, ne2, ne3}; + int64_t ne_perm[4]; + for (int i = 0; i < 4; ++i) { + ne_perm[permute[i]] = ne[i]; + } + ggml_tensor * t; + if (is_view) { + ggml_tensor * t0 = 
ggml_new_tensor_4d(ctx, type, ne_perm[0], 2*ne_perm[1], ne_perm[2], ne_perm[3]); + t = ggml_view_4d(ctx, t0, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3], t0->nb[1], t0->nb[2], t0->nb[3], 0); + } else { + t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]); + } + if (permute != std::array{0, 1, 2, 3}) { + t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]); + } + return t; + }; + + ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr23[0], nr23[1], false); + ggml_set_name(q, "q"); + + ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1], true); // the K tensor is usually a view of the K cache + ggml_set_name(k, "k"); + + ggml_tensor * v = nullptr; + if (hsk_padded == 576 && hsv_padded == 512) { + // TODO: this branch should become a separate test case parameter instead of hardcoding this for these head shapes + + // in this branch, the V cache is sub-view of the K cache. this is used by some MLA-based models + // for more info: + // - https://github.com/ggml-org/llama.cpp/pull/13435 + // - https://github.com/ggml-org/llama.cpp/pull/18953#issuecomment-3774948392 + // - https://github.com/ggml-org/llama.cpp/pull/18986 + v = ggml_view_4d(ctx, k, hsv_padded, kv, nh, nr23[1], k->nb[1], k->nb[2], k->nb[3], 0); + } else { + v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache + } + ggml_set_name(v, "v"); + + ggml_tensor * m = nullptr; + if (mask) { + m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, nr23[1]); + ggml_set_name(m, "m"); + } + + ggml_tensor * s = nullptr; + if (sinks) { + s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, q->ne[2]); + ggml_set_name(s, "s"); + } + + ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hsk), max_bias, logit_softcap); + ggml_flash_attn_ext_add_sinks(out, s); + ggml_flash_attn_ext_set_prec (out, prec); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (strcmp(t->name, "s") == 0) { + // make the sink values more noticable in order to trigger a test failure when the implementation is wrong + init_tensor_uniform(t, -10.0f, 10.0f); + } else if (strcmp(t->name, "m") == 0) { + init_tensor_kq_mask(t); + } else { + init_tensor_uniform(t); + } + } + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_CROSS_ENTROPY_LOSS +struct test_cross_entropy_loss : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(logits); + ggml_set_name(logits, "logits"); + + ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data()); + // The labels are assumed to be constant -> no gradients. + ggml_set_name(labels, "labels"); + + // Ensure labels add up to 1: + labels = ggml_soft_max(ctx, labels); + ggml_set_name(labels, "labels_normalized"); + + ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + // For larger abs. 
diffs between logits softmax is more linear, therefore more precise num. gradients. + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -100.0f, 100.0f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_CROSS_ENTROPY_LOSS_BACK +struct test_cross_entropy_loss_back : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_cross_entropy_loss_back(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * grad = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name(grad, "grad"); + + ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(logits, "logits"); + + ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(labels, "labels"); + + // Ensure labels add up to 1: + labels = ggml_soft_max(ctx, labels); + ggml_set_name(labels, "labels_normalized"); + + ggml_tensor * out = ggml_cross_entropy_loss_back(ctx, grad, logits, labels); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_OPT_STEP_ADAMW +struct test_opt_step_adamw : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); // Despite tensor a having gradients the output tensor will not. + ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad, "grad"); + + ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad_m, "grad_m"); + + ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad_v, "grad_v"); + + ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7); + ggml_set_name(adamw_params, "adamw_params"); + + ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values. + } + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_OPT_STEP_SGD +struct test_opt_step_sgd : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_opt_step_sgd(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 5, 4, 3 }) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); // Despite tensor a having gradients the output tensor will not. 
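+        // note: unlike the AdamW step above, which consumes a 7-element
+        // adamw_params tensor, the SGD step only takes the 2-element
+        // sgd_params tensor created below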
+ ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad, "grad"); + + ggml_tensor * sgd_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); + ggml_set_name(sgd_params, "sgd_params"); + + ggml_tensor * out = ggml_opt_step_sgd(ctx, a, grad, sgd_params); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, 0.0f, 1.0f); // sgd_params need non-negative values. + } + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_CUMSUM +struct test_cumsum : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_cumsum(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 5, 4, 3 }) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_cumsum(ctx, a); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_XIELU +struct test_xielu : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_xielu(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 5, 4, 3 }) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + float alpha_n = 4.0f; + float alpha_p = 20.0f; + float beta = 0.5f; + float eps = 0.0000001f; + + ggml_tensor * out = ggml_xielu(ctx, a, alpha_n, alpha_p, beta, eps); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_TRI +struct test_tri : public test_case { + const ggml_type type; + const std::array ne; + const ggml_tri_type tri_type; + + std::string vars() override { return VARS_TO_STR3(type, ne, tri_type); } + + test_tri(ggml_tri_type tri_type, ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 10, 4, 3 }) + : type(type), ne(ne), tri_type(tri_type) { + GGML_ASSERT(ne[0] == ne[1]); + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_tri(ctx, a, tri_type); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } +}; + +// GGML_OP_FILL +struct test_fill : public test_case { + const ggml_type type; + const std::array ne; + float c; + + std::string vars() override { return VARS_TO_STR3(type, ne, c); } + + test_fill(float c, ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 10, 4, 3 }) + : type(type), ne(ne), 
c(c) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_fill(ctx, a, c); + + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SOLVE_TRI +struct test_solve_tri : public test_case { + const ggml_type type; + const std::array ne_lhs; + const std::array ne_rhs; + + std::string vars() override { return VARS_TO_STR3(type, ne_lhs, ne_rhs); } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + int64_t n = ne_lhs[0]; + int64_t k = ne_rhs[0]; + int64_t batch = ne_lhs[2] * ne_lhs[3]; + // n * (n + 1) / 2 non-zero elements of lhs, 2 flops each, for each col of rhs + return n * (n + 1) * k * batch; + } + + test_solve_tri(ggml_type type = GGML_TYPE_F32, + std::array ne_lhs = { 10, 10, 4, 3 }, + std::array ne_rhs = { 3, 10, 4, 3 } + ) + : type(type), ne_lhs(ne_lhs), ne_rhs(ne_rhs) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne_lhs[0], ne_lhs[1], ne_lhs[2], ne_lhs[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne_rhs[0], ne_rhs[1], ne_rhs[2], ne_rhs[3]); + ggml_set_param(b); + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_solve_tri(ctx, a, b, true, true, false); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (strcmp(t->name, "a") == 0) { + // note: avoid zeros in the diagonal + init_tensor_tril(t, 0.1, 1.0f); + } else { + init_tensor_uniform(t, -1.0f, 1.0f); + } + } + } +}; + +// GGML_OP_DIAG +struct test_diag : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_diag(ggml_type type = GGML_TYPE_F32, + std::array ne = { 10, 1, 4, 3 }) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + GGML_ASSERT(ne[1] == 1); + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_diag(ctx, a); + ggml_set_name(out, "out"); + + return out; + } +}; + + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +struct llama_hparams { + uint32_t n_vocab; + uint32_t n_embd; + uint32_t n_head; + uint32_t n_head_kv; + static constexpr uint32_t n_layer = 1; + uint32_t n_rot; + uint32_t n_embd_head; // dimension of values (d_v) + uint32_t n_ff; + + float f_norm_eps; + float f_norm_rms_eps; + + // cparams + static constexpr uint32_t n_ctx = 512; // user-specified context size + static constexpr uint32_t n_ctx_orig = n_ctx; + + // batch + int32_t n_tokens; + + // llm_build_context + static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx + static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache + + uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads + return n_embd_head * n_head_kv; + } +}; + +// LLM base class +struct test_llm : public test_case { + llama_hparams hp; + +protected: + test_llm(llama_hparams hp) + : hp(std::move(hp)) { + } + +public: + struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type) { + switch (type) 
{ + case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break; + } + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cur = ggml_add(ctx, cur, mb); + } + return cur; + } + + void llm_build_kv_store( + struct ggml_context * ctx, + struct ggml_tensor * k_l, + struct ggml_tensor * v_l, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur) { + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens)); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(), + (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(), + ( hp.n_ctx)*ggml_element_size(v_l), + (hp.kv_head)*ggml_element_size(v_l)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_cpy(ctx, k_cur, k_cache_view); + ggml_cpy(ctx, v_cur_t, v_cache_view); + } + + struct ggml_tensor * llm_build_kqv( + struct ggml_context * ctx, + struct ggml_tensor * k_l, + struct ggml_tensor * v_l, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_mask, + float kq_scale) { + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + + struct ggml_tensor * k = + ggml_view_3d(ctx, k_l, + hp.n_embd_head, hp.n_kv, hp.n_head_kv, + ggml_row_size(k_l->type, hp.n_embd_gqa()), + ggml_row_size(k_l->type, hp.n_embd_head), + 0); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, v_l, + hp.n_kv, hp.n_embd_head, hp.n_head_kv, + ggml_element_size(v_l)*hp.n_ctx, + ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head, + 0); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + + struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens); + + struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd); + cur = ggml_mul_mat(ctx, wo, cur); + + return cur; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + // pos + std::vector data(hp.n_tokens); + for (int i = 0; i < hp.n_tokens; i++) { + data[i] = rand() % hp.n_ctx; + } + ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int)); + } else { + init_tensor_uniform(t); + } + } + } +}; + +// Llama +struct test_llama : public test_llm { + static constexpr float freq_base = 10000.0f; + static constexpr float freq_scale = 1.0f; + static constexpr float ext_factor = 0.0f; + static constexpr float attn_factor = 1.0f; + static constexpr float beta_fast = 32.0f; + static constexpr float beta_slow = 1.0f; + bool fused; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "LLAMA"; + } + + std::string vars() override { + auto n_tokens = hp.n_tokens; + return VARS_TO_STR1(n_tokens); + } + + double max_nmse_err() override { + return 2e-3; + } + + bool run_whole_graph() override { return fused; } + + test_llama(int n_tokens = 1, bool fused = false) + : test_llm({ + /*n_vocab =*/ 32000, + /*n_embd =*/ 3200, + /*n_head =*/ 32, + /*n_head_kv =*/ 32, + /*n_rot =*/ 100, + /*n_embd_head =*/ 100, + /*n_ff =*/ 8640, + /*f_norm_eps =*/ 
0.f, + /*f_norm_rms_eps =*/ 1e-5f, + /*n_tokens =*/ n_tokens, + }) + , fused(fused) + { + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); + + ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); + ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); + + for (uint32_t il = 0; il < hp.n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS); + + // self-attention + { + ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd); + ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa()); + ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa()); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur); + + Qcur = ggml_rope_ext( + ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr, + hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr, + hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); + + cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA); + + // feed-forward network + ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS); + + ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); + ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); + ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); + struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur); + cur = ggml_mul_mat(ctx, ffn_gate, cur); + cur = ggml_silu(ctx, cur); + cur = ggml_mul(ctx, cur, tmp); + cur = ggml_mul_mat(ctx, ffn_down, cur); + + cur = ggml_add(ctx, cur, ffn_inp); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS); + + // lm_head + ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab); + cur = ggml_mul_mat(ctx, output, cur); + + return cur; + } +}; + +// Falcon +struct test_falcon : public test_llm { + static constexpr float freq_base = 10000.0f; + static constexpr float freq_scale = 1.0f; + static constexpr float ext_factor = 0.0f; + static constexpr float attn_factor = 1.0f; + static constexpr float beta_fast = 32.0f; + static constexpr float beta_slow = 1.0f; + + std::string 
op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "FALCON"; + } + + std::string vars() override { + auto n_tokens = hp.n_tokens; + return VARS_TO_STR1(n_tokens); + } + + double max_nmse_err() override { + return 2e-3; + } + + test_falcon(int n_tokens = 1) + : test_llm({ + /*n_vocab =*/ 32000, + /*n_embd =*/ 3200, + /*n_head =*/ 50, + /*n_head_kv =*/ 1, + /*n_rot =*/ 64, + /*n_embd_head =*/ 64, + /*n_ff =*/ 8640, + /*f_norm_eps =*/ 1e-5f, + /*f_norm_rms_eps =*/ 0.f, + /*n_tokens =*/ n_tokens, + }) { + } + + ggml_tensor * build_graph(ggml_context * ctx) override { + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); + + ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); + ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); + + for (uint32_t il = 0; il < hp.n_layer; ++il) { + // norm + ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM); + + // self-attention + { + cur = attn_norm; + + ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa()); + + cur = ggml_mul_mat(ctx, wqkv, cur); + + struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa()))); + + Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens); + Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_ext( + ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + + llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); + + cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); + } + + struct ggml_tensor * ffn_inp = cur; + + // feed forward + { + ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); + ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); + cur = attn_norm; + cur = ggml_mul_mat(ctx, ffn_up, cur); + cur = ggml_gelu(ctx, cur); + cur = ggml_mul_mat(ctx, ffn_down, cur); + } + + cur = ggml_add(ctx, cur, ffn_inp); + + cur = ggml_add(ctx, cur, inpL); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); + cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM); + + // lm_head + ggml_tensor * output = 
ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
+        cur = ggml_mul_mat(ctx, output, cur);
+
+        return cur;
+    }
+};
+
+
+// ###########################################
+// ## Section 3: GGML Op Test Instantiation ##
+// ###########################################
+static const ggml_type all_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+    GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_MXFP4,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+};
+
+static const ggml_type base_types[] = {
+    GGML_TYPE_F32, GGML_TYPE_F16,
+    GGML_TYPE_Q8_0, // for I8MM tests
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1, // for I8MM tests
+    GGML_TYPE_Q4_K,
+    GGML_TYPE_MXFP4, // TODO: or "other"
+    GGML_TYPE_IQ2_XXS
+};
+
+static const ggml_type other_types[] = {
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
+    GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
+    GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
+    GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
+    GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+    GGML_TYPE_BF16,
+};
+
+#ifdef _MSC_VER
+// Workaround long compile time with msvc
+#pragma optimize("", off)
+#endif
+
+// Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
+static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
+    std::vector<std::unique_ptr<test_case>> test_cases;
+    std::default_random_engine rng(0);
+
+    // unary ops
+    for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (int v : {0, 1}) {
+            for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
+                if (op == GGML_UNARY_OP_XIELU) {
+                    continue; // need extra params, separate test
+                }
+                test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 128, 2, 2, 2 }, v));
+                test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 5, 7, 11, 13 }, v));
+            }
+        }
+    }
+
+    // glu ops
+    for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (int v : {0, 1}) {
+            for (int op = 0; op < GGML_GLU_OP_COUNT; op++) {
+                if (op == GGML_GLU_OP_SWIGLU_OAI) {
+                    // SWIGLU_OAI is handled separately
+                    continue;
+                }
+
+                for (bool swapped : {false, true}) {
+                    test_cases.emplace_back(new test_glu((ggml_glu_op) op, type, { 128, 2, 2, 2 }, v, swapped));
+                    test_cases.emplace_back(new test_glu((ggml_glu_op) op, type, { 5, 7, 11, 13 }, v, swapped));
+                }
+
+                test_cases.emplace_back(new test_glu_split((ggml_glu_op) op, type, { 128, 2, 2, 2 }, v));
+                test_cases.emplace_back(new test_glu_split((ggml_glu_op) op, type, { 5, 7, 11, 13 }, v));
+            }
+        }
+    }
+
+    for (int v : {0, 1}) {
+        for (float alpha : {.5f, 1.702f}) {
+            for (float limit : {2.0f, 7.0f}) {
+                test_cases.emplace_back(new test_swiglu_oai(GGML_TYPE_F32, { 128, 2, 2, 2 }, v, alpha, limit));
+            }
+        }
+    }
+
+    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_Q4_0}) {
+        test_cases.emplace_back(new test_get_rows(type, 300*256, 5, 4, 1, 2, false));
+        test_cases.emplace_back(new test_get_rows(type, 256, 80000, 70000, 2, 1, false));
+        test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, 700, 100, false));
+    }
+
+    test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, 1, false));
+    for (ggml_type type : all_types) {
+        for 
(int b : {1, 7}) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, 1, v)); + } + } + } + for (int b : {1, 7}) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, 1, v)); + } + } + + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false)); + for (ggml_type type : all_types) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v)); + } + } + for (bool v : {false, true}) { + test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_I32, 256, 5, 4, 1, v)); + } + + test_cases.emplace_back(new test_set_rows(GGML_TYPE_F32, GGML_TYPE_I64, { 1, 8, 1, 3 }, { 1, 1 }, 2, false)); + test_cases.emplace_back(new test_set_rows(GGML_TYPE_F32, GGML_TYPE_I32, { 1, 8, 1, 3 }, { 1, 1 }, 2, false)); + test_cases.emplace_back(new test_set_rows(GGML_TYPE_Q8_0, GGML_TYPE_I32, { 256, 5, 1, 3 }, { 1, 1, }, 1, false)); + for (ggml_type type : all_types) { + for (int b : {1, 7}) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_set_rows(type, GGML_TYPE_I64, { 256, 5, b, 3 }, { 1, 1, }, 1, v)); + test_cases.emplace_back(new test_set_rows(type, GGML_TYPE_I64, { 256, 11, 1, b }, { 2, 3, }, 7, v)); + + test_cases.emplace_back(new test_set_rows(type, GGML_TYPE_I64, { 3*ggml_blck_size(type), 3, b, 1 }, { 2, 3, }, 2, v)); + + if (ggml_blck_size(type) == 1) { + test_cases.emplace_back(new test_set_rows(type, GGML_TYPE_I64, { 31, 3, b, 1 }, { 2, 3, }, 2, v)); + test_cases.emplace_back(new test_set_rows(type, GGML_TYPE_I64, { 33, 5, 1, b }, { 2, 3, }, 1, v)); + } + } + } + } + + for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION }) { + for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { + for (int ne2 : {1, 8, 512}) { + test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 1 }, mode)); + test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, ne2, 3 }, mode)); + } + } + } + + for (ggml_type type_input : {GGML_TYPE_F32}) { + for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { + for (int k0 : {1, 3}) { + for (int k1 : {1, 3}) { + for (int s0 : {1, 2}) { + for (int s1 : {1, 2}) { + for (int p0 : {0, 1}) { + for (int p1 : {0, 1}) { + test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1)); + } + } + } + } + } + } + } + } + + for (ggml_type type_input : {GGML_TYPE_F32}) { + for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { + for (int k0 : {1, 3}) { + for (int s0 : {1, 2}) { + for (int p0 : {0, 1}) { + test_cases.emplace_back(new test_pool1d(pool_type, type_input, { 10, 3, 2, 1 }, k0, s0, p0)); + test_cases.emplace_back(new test_pool1d(pool_type, type_input, { 11, 1, 3, 2 }, k0, s0, p0)); + test_cases.emplace_back(new test_pool1d(pool_type, type_input, { 128, 2, 1, 3 }, k0, s0, p0)); + } + } + } + } + } + +#if 0 + // >4GB im2col destination. Too slow to run by default. + // Test cases taken from Wan2.1 T2V 1.3B. 
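+    // (rough sizing, assuming an f32 dst of OW*OH rows by KW*KH*IC columns per image:
+    //  832*480 * 3*3*192 * 4 images ~= 2.8e9 floats ~= 11 GB, hence the >4GB label)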
+ test_cases.emplace_back(new test_im2col (GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {832, 480, 192, 4}, {3, 3, 192, 96}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {834, 482, 6, 96}, {3, 3,3, 9216}, 96, 1, 1, 1, 0, 0, 0, 1, 1, 1, false)); +#endif + + // im2col 1D + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + for (int s0 : {1, 3}) { + for (int p0 : {0, 3}) { + for (int d0 : {1, 3}) { + test_cases.emplace_back(new test_im2col( + GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1}, + s0, 0, p0, 0, d0, 0, false)); + } + } + } + + // im2col 2D + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); + for (int s0 : {1, 3}) { + for (int s1 : {1, 3}) { + for (int p0 : {0, 3}) { + for (int p1 : {0, 3}) { + for (int d0 : {1, 3}) { + for (int d1 : {1, 3}) { + test_cases.emplace_back(new test_im2col( + GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2}, + s0, s1, p0, p1, d0, d1, true)); + } + } + } + } + } + } + + // extra tests for im2col 2D + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 1536, 729}, {2, 2, 1536, 4096}, 1, 1, 0, 0, 1, 1, true)); + + // im2col 3D + test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); + test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); + for (int s0 : {1, 3}) { + for (int s1 : {1, 3}) { + for (int s2 : {1, 3}) { + for (int p0 : 
{0, 3}) {
+                    for (int p1 : {0, 3}) {
+                        for (int p2 : {0, 3}) {
+                            for (int d0 : {1, 3}) {
+                                for (int d1 : {1, 3}) {
+                                    for (int d2 : {1, 3}) {
+                                        for (int IC : {1, 3}) {
+                                            for (bool v : {false, true}) {
+                                                test_cases.emplace_back(new test_im2col_3d(
+                                                    GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 10, 3}, {3, 3, 3, 3},
+                                                    IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, v));
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+// Conv_2D test cases
+#ifdef DETAILED_TESTS
+    // Probably we do not have enough time to execute these in the pipeline.
+    uint32_t iwh_idx = 0;
+    uint32_t kwh_idx = 1;
+    uint32_t Cout_idx = 2;
+    uint32_t Cin_idx = 3;
+    uint32_t B_idx = 4;
+
+    std::vector<std::array<int64_t, 5>> cases = {
+        //{IWH, KWH, Cout, Cin, B}
+        // K=CRS=NPQ=4096 conv_2d matmul performance
+        {19, 4, 4096, 256, 16},
+        // K=128, CRS=128, NPQ=4096
+        { 19, 4, 128, 8, 16},
+        // K=130, CRS=128, NPQ=4096
+        { 19, 4, 130, 8, 16},
+        // Edge case: K x CRS is small
+        { 19, 2, 4, 4, 16},
+        // A ConvNet's first layer
+        { 224, 3, 8, 3, 1 },
+        // A ConvNet's first layer with 2x2 convolution, and 1 channel
+        { 224, 2, 8, 1, 1 },
+        // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
+        { 224, 2, 8, 1, 8 },
+        // A middle layer of a ConvNet
+        { 58, 3, 64, 32, 1 },
+        // A middle layer of a ConvNet, several images in the batch
+        { 58, 3, 64, 32, 8 },
+        // A deep layer of a ConvNet, several images in the batch
+        { 16, 3, 256, 128, 8 }
+    };
+
+    for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        for (auto act_case : cases) {
+            test_cases.emplace_back(new test_conv_2d(
+                { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
+                { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
+                kernel_type, 1, 1, 0, 0, 1, 1, false));
+        }
+    }
+#endif
+
+    // CONV_2D:
+    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+    };
+
+    //uint32_t s0 = 3;
+    uint32_t s1 = 5;
+    uint32_t p0 = 5;
+    //uint32_t p1 = 2;
+    uint32_t d0 = 2;
+    uint32_t d1 = 4;
+
+    for (uint32_t s0 : { 1, 3 }) {
+        for (uint32_t p1 : { 2, 5 }) {
+            for (uint32_t Cin : { 1, 25 }) {
+                for (uint32_t Cout : { 1, 12 }) {
+                    for (uint32_t KH : { 1, 2, 3, 11 }) {
+                        for (uint32_t KW : { 1, 2, 3, 11 }) {
+                            for (uint32_t H : { 1, 133 }) {
+                                for (uint32_t W : { 1, 141 }) {
+                                    if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
+                                        calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
+                                        for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                                            test_cases.emplace_back(new test_conv_2d(
+                                                { W, H, Cin, 2 }, { KW, KH, Cin, Cout }, kernel_type, s0, s1, p0, p1, d0, d1, false));
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // sycl backend will limit task global_range < MAX_INT
+    // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
+    // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.) 
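+    // (rough count, assuming the im2col dst holds OW*OH rows of KW*KH*IC elements:
+    //  1024*1024 * 3*3*256 ~= 2.4e9 elements, already past INT_MAX ~= 2.1e9)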
+ // these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend) + // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true)); + + test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false)); + test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true)); + test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); + test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); + + // CONV_3D + auto calc_conv_output_size_3d = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + + for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (int N : {1, 2}) { + for (int IC : {1, 3}) { + for (int OC : {1, 4}) { + for (int s0 : {1, 2}) { + for (int p1 : {0, 1}) { + for (int d2 : {1, 2}) { + int64_t IW = 20, IH = 22, ID = 18; + int64_t KW = 3, KH = 3, KD = 3; + int s1 = s0, s2 = s0; + int p0 = p1, p2 = p1; + int d0 = d2, d1 = d2; + + if (calc_conv_output_size_3d(IW, KW, s0, p0, d0) <= 0 || + calc_conv_output_size_3d(IH, KH, s1, p1, d1) <= 0 || + calc_conv_output_size_3d(ID, KD, s2, p2, d2) <= 0) { + continue; + } + test_cases.emplace_back(new test_conv_3d( + N, IC, ID, IH, IW, + OC, KD, KH, KW, + s0, s1, s2, p0, p1, p2, d0, d1, d2, + kernel_type)); + + // Asymmetric kernel and params + int64_t asym_KW = 5, asym_KH = 1, asym_KD = 3; + int asym_s0 = 2, asym_s1 = 1, asym_s2 = 1; + int asym_p0 = 2, asym_p1 = 0, asym_p2 = 1; + int asym_d0 = 1, asym_d1 = 1, asym_d2 = 2; + + if (calc_conv_output_size_3d(IW, asym_KW, asym_s0, asym_p0, asym_d0) <= 0 || + calc_conv_output_size_3d(IH, asym_KH, asym_s1, asym_p1, asym_d1) <= 0 || + calc_conv_output_size_3d(ID, asym_KD, asym_s2, asym_p2, asym_d2) <= 0) { + continue; + } + test_cases.emplace_back(new test_conv_3d( + N, IC, ID, IH, IW, + OC, asym_KD, asym_KH, asym_KW, + asym_s0, asym_s1, asym_s2, asym_p0, asym_p1, asym_p2, asym_d0, asym_d1, asym_d2, + kernel_type)); + } + } + } + } + } + } + // Case with kernel size 1 + test_cases.emplace_back(new test_conv_3d(1, 4, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, kernel_type)); + } + + for(uint32_t Cout : {1, 9}){ + for(uint32_t Cin : {1, 7}){ + for(uint32_t K : {1, 3, 1337}){ + for(uint32_t L : {1, 2, 13}){ + for(uint32_t s0: {1, 2, 3}){ + test_cases.emplace_back(new test_conv_transpose_1d({L,Cin,1,1}, {K,Cout,Cin,1}, s0, 0, 1)); + } + } + } + } + } + + test_cases.emplace_back(new test_conv_transpose_1d()); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); + test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); + + test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1)); + test_cases.emplace_back(new 
test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2)); + test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1)); + + test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1})); + test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1})); + + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 513, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438, 3, 1, 1})); + + for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2})); + } + + for (bool view : {false, true}) { + test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 1}, view)); + test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {2, 1, 1, 1}, view)); + test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 2, 1, 1}, view)); + test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 2, 1}, view)); + test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 2}, view)); + } + + test_cases.emplace_back(new test_dup(GGML_TYPE_F32)); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); + test_cases.emplace_back(new test_dup(GGML_TYPE_I32)); + test_cases.emplace_back(new test_dup(GGML_TYPE_I16)); + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous + test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); + + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); + } + + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); + } + + // same-type copy + for (ggml_type type : all_types) { + const auto nk = ggml_blck_size(type); + + for (int k = 1; k < 4; ++k) { + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + } + } 
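+
+    // note: a row of a block-quantized type must hold a whole number of blocks, which is
+    // why the same-type copies above use k*nk columns with nk = ggml_blck_size(type)
+    // (e.g. 32 for GGML_TYPE_Q4_0 and 256 for the K-quants); a narrower row could not
+    // be addressed as complete blocks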
+
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
+        for (ggml_type type_dst : all_types) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_src : all_types) {
+        for (ggml_type type_dst : {GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+        }
+    }
+    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
+        }
+    }
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+
+    for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
+        for (bool use_view_slice : { true, false }) {
+            for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
+                     {2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
+                if (use_view_slice && (type_dst == GGML_TYPE_F16 || type_dst == GGML_TYPE_BF16)) {
+                    continue; // TODO: add after WebGPU is fixed
+                }
+                test_cases.emplace_back(new test_cont(type_dst, ne, use_view_slice));
+            }
+        }
+    }
+
+    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr, bool perm1 = false) {
+        for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {
+            test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr, 1, perm1));
+        }
+    };
+    for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (bool perm1 : {false, true}) {
+            add_test_bin_bcast(type, {1, 1, 8, 1}, {1, 1, 1, 1}, perm1);
+            add_test_bin_bcast(type, {1, 1, 1, 1}, {32, 1, 1, 1}, perm1);
+            add_test_bin_bcast(type, {1, 1, 320, 320}, {1, 1, 1, 1}, perm1);
+            add_test_bin_bcast(type, {10, 5, 1, 1}, {1, 1, 1, 1}, perm1);
+            add_test_bin_bcast(type, {10, 5, 
4, 1}, {1, 1, 1, 1}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 1}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 1, 1, 1}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 1, 1}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 1}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 1, 2}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 1, 2, 2}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2}, perm1); + add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2}, perm1); + } + + // test case for k_bin_bcast_unravel in CUDA backend + add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1}); + + // stable diffusion + add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1}); + add_test_bin_bcast(type, {1280, 16, 16, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 256, 1, 1}); + add_test_bin_bcast(type, {1, 1, 1280, 1}, {16, 16, 1, 1}); + add_test_bin_bcast(type, {16, 16, 1280, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(type, {1, 1, 1920, 1}, {16, 16, 1, 1}); + add_test_bin_bcast(type, {1, 1, 2560, 1}, {16, 16, 1, 1}); + add_test_bin_bcast(type, {1, 1, 1280, 1}, {32, 32, 1, 1}); + add_test_bin_bcast(type, {1, 1, 1920, 1}, {32, 32, 1, 1}); + add_test_bin_bcast(type, {1, 1, 640, 1}, {32, 32, 1, 1}); + add_test_bin_bcast(type, {5120, 1, 1, 1}, {1, 256, 1, 1}); + add_test_bin_bcast(type, {640, 1, 1, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(type, {64, 262144, 1, 1}, {1, 1, 1, 1}); + //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {1, 1, 1, 1}); + //add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1}); + } + + // single inplace tests, especially important for WebGPU backend since kernels for inplace vs. not are different + test_cases.emplace_back(new test_bin_bcast(ggml_add_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); + test_cases.emplace_back(new test_bin_bcast(ggml_mul_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); + test_cases.emplace_back(new test_bin_bcast(ggml_sub_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); + test_cases.emplace_back(new test_bin_bcast(ggml_div_inplace, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); + + // fusion + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1}, 2)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 2, 1, 1}, 3)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}, 4)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 2}, 5)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}, 6)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}, 7)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8)); + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16)); + + test_cases.emplace_back(new test_add1()); + test_cases.emplace_back(new test_add1(GGML_TYPE_F32, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_scale()); + test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f)); + test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f, true)); // inplace test + test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {100, 10, 10, 10}, 2.0f, 1.0f)); + 
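+    // soft-capping squashes values smoothly into (-cap, cap); assuming the usual
+    // definition softcap(x) = cap * tanh(x / cap), the 50.0f below is the cap
+    // (the same shape of capping that e.g. Gemma-2 applies to attention logits)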
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f)); + test_cases.emplace_back(new test_silu_back()); + + for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f }) { + for (uint32_t n : { 64, 1025 }) { + for (bool v : { false, true }) { + test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps)); + test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps)); + } + test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps)); + test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps)); + } + } + + // in-place tests + test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, false, 1e-6f, true)); + + for (float eps : { 0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f }) { + for (uint32_t n : { 64, 1025 }) { + test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false)); + test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true)); + test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false)); + test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true)); + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false)); + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true)); + } + } + for (uint32_t n : {1, 511, 1025, 8192, 33*512}) { + for (bool multi_add : {false, true}) { + test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false, multi_add)); + } + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false)); + } + + for (auto multi_add : {false, true}) { + for (auto set_rows : {false, true}) { + for (auto rope : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX}) { + test_cases.emplace_back(new test_rms_norm_mul_rope({768, 1, 1, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({768, 3, 1, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({768, 3, 5, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({128, 32, 2, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({128, 4, 2, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({128, 32, 50, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({128, 4, 50, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({8192, 2, 2, 1}, 1e-6f, multi_add, set_rows, rope)); + test_cases.emplace_back(new test_rms_norm_mul_rope({8192, 2, 2, 1}, 1e-6f, multi_add, set_rows, rope)); + } + } + } + for (int64_t d_conv : {3, 4, 9}) { + for (int64_t d_inner: {1024, 1536, 2048}) { + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {2 * d_conv, d_inner, 1, 1}, {d_conv, d_inner, 1, 1})); + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {d_conv, d_inner, 4, 1}, {d_conv, d_inner, 1, 1})); + } + } + + test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1, 1024, 1, 32, 4)); // Mamba-1 + test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 16, 2, 32, 4)); // Mamba-2 + test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 256, 64, 8, 2, 32, 4)); // Falcon-H1 + + test_cases.emplace_back(new 
test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
+
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 128, 4));
+
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 128, 4));
+
+#if 0
+    // > 4GB A matrix. Too slow to be enabled by default.
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 900000, 3, 2592, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 1700000, 96, 2592, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 1700000, 3, 2592, {1, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 1700000, 1, 2592, {1, 1}, {1, 1}));
+
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q8_0, GGML_TYPE_F32, 128, 128, false, 8192, 2, 5120)); // Llama-4-Maverick-17B-128E-PAB-Q8_0
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q8_0, GGML_TYPE_F32, 128, 128, false, 8192, 1, 5120)); // Llama-4-Maverick-17B-128E-PAB-Q8_0
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 8192, 1, 5120, {128, 1}, {1, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 8192, 512, 5120, {128, 1}, {1, 1}));
+#endif
+
+    for (ggml_type type_a : all_types) {
+        for (int i = 1; i < 10; ++i) {
+            test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
+        }
+    }
+
+#if 0
+    {
+        // Test paths in OpenCL
+        std::vector<int> ns = {32, 64, 128, 256, 512, 1024, 4096};
+        std::vector<int> ks = {896, 1536, 4096};
+        for (auto n : ns) {
+            for (auto k : ks) {
+                test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 1024, n, k, {1, 1}, {1, 1}));
+            }
+        }
+    }
+#endif
+
+#if 1
+    for (ggml_type type_a : base_types) {
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            std::vector<int> ks = { 256 };
+            if (ggml_blck_size(type_a) == 1) {
+                ks.push_back(4);
+            }
+            for (auto k : ks) {
+                // test cases without permutation
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {1, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {2, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {1, 2}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 1}, {1, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 1}, {2, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 1}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 2}));
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 2}));
+
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 1}));
+                test_cases.emplace_back(new 
test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {2, 2})); + + // test cases with permutation + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); + + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); + + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); + } + + // test cases with large ne00/ne10 to cover stream-k fixup + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1})); + + // test cases with large batch size + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1})); + } + } + for (ggml_type type_a : other_types) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + if (ggml_blck_size(type_a) != 256) { + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1})); + } + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1})); + } + } +#else + // m = a rows + // n = b rows + // k = cols + std::uniform_int_distribution<> dist_m(1, 128); + std::uniform_int_distribution<> dist_n(16, 128); + std::uniform_int_distribution<> dist_k(1, 16); + for (int i = 0; i < 1000; i++) { + for (ggml_type type_a : all_types) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + int m = dist_m(rng); + int n = dist_n(rng); + int k = dist_k(rng) * ggml_blck_size(type_a); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1})); + } + } + } +#endif + + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 
128, 45, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3)); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1})); + + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 576, 512, 576, {1,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 1, 2048, 8192, {1, 1}, {1, 1})); + for (ggml_type type_a : all_types) { + test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 1, 64, 256, {1, 1}, {1, 1})); + } + +#if 0 + // test the mat-mat path for Metal + for (int k = 1; k < 512; ++k) { + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 127, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 127, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 77, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 128, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 128, k, {12,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 50, 200, k)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, true, 50, 200, k)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F32, GGML_TYPE_F32, 16, 16, false, 50, 200, k)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F32, GGML_TYPE_F32, 16, 16, true, 50, 200, k)); + } +#endif + + for (auto bs2 : {1,3}) { + for (auto bs : {1,2,4,8}) { + for (auto nr : {1,4}) { + for (uint32_t m = 0; m < 2; ++m) { + for (uint32_t k = 0; k < 2; ++k) { + for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, 2*1056 + k)); + } + } + } + } + } + } + + // sycl backend will limit task global_range < MAX_INT + // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion) + // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.) 
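+    // (rough arithmetic: the 262144 x 9216 f16 operand alone is ~2.4e9 elements,
+    //  so a single convert launch would already exceed the < MAX_INT global_range)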
+ // this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend) + // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1})); + + // test large experts*tokens + for (bool b : {false, true}) { + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, b, 32, 1024, 16)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 2, 2, b, 32, 8192, 64)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, b, 50, 200, 64)); + } + + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1)); + test_cases.emplace_back(new test_mul_mat_id_fusion(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3)); + + // gpt-oss issue with Vulkan mmq_id + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880)); + + for (ggml_type type_a : base_types) { + for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { + for (int n_mats : {4, 8}) { + for (int n_used : {1, 2, 4}) { + for (bool b : {false, true}) { + for (int n : {1, 4, 5, 17, 32, 129}) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); + } + } + } + } + } + } + + for (ggml_type type_a : other_types) { + for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { + for (int n_mats : {4}) { + for (int n_used : {2}) { + for (bool b : {false}) { + for (int n : {1, 32}) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); + } + } + } + } + } + } + + for (int bs : {1, 4, 512}) { + for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_K}) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + // test with mul after (ffn_moe_weighted) + test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1, true)); + } + } + } + + for (ggml_type type_a : base_types) { + for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (int n : {1, 16}) { + for (int k : {1, 16}) { + for (int bs2 : {1, 3}) { + for (int bs3 : {1, 3}) { + for (int nr2 : {1, 2}) { + for (int nr3 : {1, 2}) { + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, n, k, {bs2, bs3}, {nr2, nr3})); + } + } + } + } + } + } + } + } + + // add_id + for (ggml_type type_a : {GGML_TYPE_F32}) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + for (int n_mats : {4, 8}) { + for (int n_used : {1, 2, 4}) { + for (int n_embd : {32, 129}) { + for (int n_token : {1, 32, 129}) { + test_cases.emplace_back(new test_add_id(type_a, type_b, n_embd, n_mats, n_used, n_token)); + } + } + } + } + } + } + + for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_sqr (type)); + test_cases.emplace_back(new test_sqrt (type)); + test_cases.emplace_back(new test_log (type)); + test_cases.emplace_back(new test_sin (type)); + test_cases.emplace_back(new test_cos (type)); + test_cases.emplace_back(new test_clamp (type)); + test_cases.emplace_back(new test_leaky_relu(type)); + test_cases.emplace_back(new test_floor (type)); + test_cases.emplace_back(new test_ceil (type)); + test_cases.emplace_back(new test_round (type)); + test_cases.emplace_back(new test_trunc (type)); + test_cases.emplace_back(new test_sqr (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_sqr (type, {1024, 1024, 1, 1})); + 
test_cases.emplace_back(new test_sqrt (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_sqrt (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_log (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_log (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_sin (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_sin (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_cos (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_cos (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_clamp (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_clamp (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_leaky_relu(type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_leaky_relu(type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_floor (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_floor (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_ceil (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_ceil (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_round (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_round (type, {1024, 1024, 1, 1})); + test_cases.emplace_back(new test_trunc (type, {7, 1, 5, 3})); + test_cases.emplace_back(new test_trunc (type, {1024, 1024, 1, 1})); + } + + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5)); + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5)); + +#if 0 + std::uniform_int_distribution<> dist_ne1(1, 50); + int exponent = 1; + while (exponent < (1 << 17)) { + std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent); + + for (int n = 0; n < 10; ++n) { + int64_t ne0 = dist_ne0(rng); + int64_t ne1 = dist_ne1(rng); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 
4.0f : 0.0f)); + } + + exponent <<= 1; + } +#endif + for (bool mask : {false, true}) { + for (bool sinks : {false, true}) { + for (float max_bias : {0.0f, 8.0f}) { + if (!mask && max_bias > 0.0f) continue; + for (float scale : {1.0f, 0.1f}) { + for (int64_t ne0 : {16, 1024}) { + for (int64_t ne1 : {16, 1024}) { + if (mask) { + for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) { + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, sinks, m_prec, {1, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, sinks, m_prec, {1, 1}, scale, max_bias)); + + if (ne0 <= 32 && ne1 <= 32) { + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 3}, mask, sinks, m_prec, {3, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, sinks, m_prec, {2, 3}, scale, max_bias)); + } + } + } else { + /* The precision of mask here doesn't matter as boolean mask is false */ + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, sinks, GGML_TYPE_F32, {1, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, sinks, GGML_TYPE_F32, {1, 1}, scale, max_bias)); + } + } + } + } + } + // inplace tests + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, mask, sinks, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f, true)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, mask, sinks, GGML_TYPE_F16, {1, 1}, 0.1f, 0.0f, true)); + } + } + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, false, GGML_TYPE_F16, {1, 1}, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, true, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, false, GGML_TYPE_F16, {1, 1}, 0.1f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200000, 1, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200000, 4, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {643251, 3, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + + for (float max_bias : {0.0f, 8.0f}) { + for (float scale : {1.0f, 0.1f}) { + for (int64_t ne0 : {16, 1024}) { + for (int64_t ne1 : {16, 1024}) { + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 2, 3}, 
scale, max_bias)); + } + } + } + } + + for (bool fw : {true, false}) { // fw == forward + bool all = true; + + for (float fs : { 1.0f, 1.4245f }) { + for (float ef : { 0.0f, 0.7465f }) { + for (float af : { 1.0f, 1.4245f }) { + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (bool ff : {false, true}) { // freq_factors + for (float v : { 0, 1 }) { + test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 7B + + if (all) { + test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 13B + test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 30B + test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); // llama 65B + test_cases.emplace_back(new test_rope(type, {16, 16, 8192, 1}, 16, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); + } + + if (all) { + test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) + + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, GGML_ROPE_TYPE_NORMAL, 512, fs, ef, af, ff, v, fw)); + + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (phi-2) + test_cases.emplace_back(new test_rope(type, { 80, 32, 4, 1}, 32, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (phi-2) + test_cases.emplace_back(new test_rope(type, { 16, 16, 8192, 1}, 16, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); + } + + if (all) { + test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B) + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B) + test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B) + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,imrope (qwen3vl 7B) + test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 20, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 32, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); + test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, 
GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
+                            test_cases.emplace_back(new test_rope(type, {128, 16, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen3vl)
+                            test_cases.emplace_back(new test_rope(type, {16, 16, 8192, 1}, 16, GGML_ROPE_TYPE_IMROPE, 512, fs, ef, af, ff, v, fw));
+                        }
+
+                        test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
+                    }
+                }
+
+                all = false;
+            }
+        }
+    }
+    }
+    }
+
+    // single inplace test per type/mode/ff
+    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        for (int mode : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION}) {
+            for (bool ff : {false, true}) {
+                test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 0, true, true));
+                test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
+                test_cases.emplace_back(new test_rope(type, {128, 32, 2, 3}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
+            }
+        }
+    }
+
+    for (int v : { 0, 1, 2, 3 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+        }
+    }
+
+    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
+        for (uint32_t i = 4; i <= 1024*1024; i *= 2) {
+            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i-1, 1, 1, 1}));
+            test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {i, 1, 1, 1}));
+        }
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1023, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1025, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2047, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
+    }
+
+    for (int n = 1; n < 5; ++n) {
+        for (int k = 1; k <= n; ++k) {
+            test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {n, 2, 1, 3}, k, true));
+        }
+    }
+    for (int i = 0; i < 20; ++i) {
+        for (int k : {1, 2, 3, 7, 15, 100, 500, 1023, 9999}) {
+            if (k <= 1<<i) {
+                test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {1<<i, 2, 1, 3}, k));
+            }
+        }
+    }
+
+    // head sizes (hsk, hsv) cover the models referenced below, e.g. 576/512 for MLA and 72 for qwen3vl
+    for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) {
+        for (int hsv : { 40, 64, 72, 80, 96, 128, 192, 256, 512 }) {
+            if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
+            if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
+            if (hsk == 576 && hsv != 512) continue;
+            for (bool mask : { true, false }) {
+                for (bool sinks : { true, false }) {
+                    for (float max_bias : { 0.0f, 8.0f }) {
+                        if (!mask && max_bias > 0.0f) continue;
+                        for (float logit_softcap : {0.0f, 10.0f}) {
+                            if (hsk != 128 && logit_softcap != 0.0f) continue;
+                            for (int nh : { 1, 4 }) {
+                                if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash
+                                for (int nr3 : { 1, 3, }) {
+                                    if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
+                                    for (int nr2 : { 1, 4, 12, 20 }) {
+                                        if (nr2 == 12 && hsk != 128) continue;
+                                        if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
+                                        //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
+                                        for (int kv : { 113, 512, 1024, }) {
+                                            if (nr2 != 1 && kv != 512) continue;
+                                            for (int nb : { 1, 3, 32, 35, }) {
+                                                for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) {
+                                                    if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue;
+                                                    for (ggml_type type_KV : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
+                                                        if (type_KV != GGML_TYPE_F16 && hsk != 64 && hsk != 72) continue;
+                                                        test_cases.emplace_back(new test_flash_attn_ext(
+                                                            hsk, hsv, nh, {nr2, nr3}, kv, nb, mask, sinks, max_bias, logit_softcap, prec, type_KV));
+                                                        // run fewer test cases permuted
+                                                        if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
+                                                            test_cases.emplace_back(new test_flash_attn_ext(
+                                                                hsk, hsv, nh, {nr2, nr3}, kv, nb, mask, sinks, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3}));
+                                                        }
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    test_cases.emplace_back(new test_cross_entropy_loss     (GGML_TYPE_F32, {   10, 5, 4, 3}));
+    test_cases.emplace_back(new test_cross_entropy_loss     (GGML_TYPE_F32, {30000, 1, 1, 1}));
+    test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {   10, 5, 4, 3}));
+    test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1}));
+
+    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
+    test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3}));
+
+    for (ggml_type type : base_types) {
+        for (bool with_gate : {false, true}) {
+            for (bool use_id : {false, true}) {
+                for (bool b : {false, true}) {
+                    if (!use_id && b) {
+                        continue;
+                    }
+                    for (bool with_bias : {false, true}) {
+                        if (!with_gate && !with_bias) {
+                            continue;
+                        }
+                        for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) {
+                            if (!with_bias && glu_op == GGML_GLU_OP_SWIGLU_OAI) {
+                                continue;
+                            }
+                            if (!with_gate && glu_op != GGML_GLU_OP_SWIGLU) {
+                                continue;
+                            }
+                            test_cases.emplace_back(new test_mul_mat_vec_fusion(type, glu_op, 1, 32, 256,
+                                use_id, 16, 8, b, with_bias, with_gate));
+                            test_cases.emplace_back(new test_mul_mat_vec_fusion(type, glu_op, 1, 32, 256,
+                                use_id, 16, 8, b, with_bias, with_gate, {1, 1}));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (auto gate : {GATING_FUNC_SOFTMAX, GATING_FUNC_SIGMOID, GATING_FUNC_SOFTMAX_WEIGHT}) {
+        for (bool with_norm : {false, true}) {
+            for (bool bias_probs : {false, true}) {
+                for (float scale_w : {0.0f, 2.0f}) {
+                    test_cases.emplace_back(new test_topk_moe({  8, 22, 1, 1},   4, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({ 31, 22, 1, 1},   8, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({ 32, 22, 1, 1},   8, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({ 40, 22, 1, 1},   8, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({ 71, 22, 1, 1},   8, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({128,  1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({129,  1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
+                    test_cases.emplace_back(new test_topk_moe({160,  4, 1, 1}, 160, with_norm, bias_probs, gate, scale_w));
+                }
+            }
+        }
+    }
+
+#if 0
+    // these tests are disabled to save execution time, but they can be handy for debugging
+    test_cases.emplace_back(new test_llama(2, true));
+    test_cases.emplace_back(new test_llama(1));
+    test_cases.emplace_back(new test_llama(2));
+    test_cases.emplace_back(new test_falcon(1));
+    test_cases.emplace_back(new test_falcon(2));
+#endif
+
+    return test_cases;
+}
+#ifdef _MSC_VER
+#pragma optimize("", on)
+#endif
+
+// Test cases for performance evaluation: should be representative of real-world use cases
+static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
+    std::vector<std::unique_ptr<test_case>> test_cases;
+
+    // Conv2d: K=CRS=NPQ=4096 matmul performance
+    uint32_t iwh_idx  = 0;
+    uint32_t kwh_idx  = 1;
+    uint32_t Cout_idx = 2;
+    uint32_t Cin_idx  = 3;
+    uint32_t B_idx    = 4;
+    std::vector<std::array<int64_t, 5>> cases = {
+        //{IWH, KWH, Cout, Cin, B}
+        // K=CRS=NPQ=4096 conv2d matmul performance
+        {19, 4, 4096, 256, 16},
+        // K=128, CRS=128, NPQ=4096
+        { 19, 4, 128, 8, 16},
+        // K=130, CRS=128, NPQ=4096
+        { 19, 4, 130, 8, 16},
+        // Edge case: K x CRS is small
+        { 19, 2, 4, 4, 16},
+        // A ConvNet's first layer
+        { 224, 3, 8, 3, 1 },
+        // A ConvNet's first layer with 2x2 convolution, and 1 channel
+        { 224, 2, 8, 1, 1 },
+        // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
+        { 224, 2, 8, 1, 8 },
+        // A middle layer of a ConvNet
+        { 58, 3, 64, 32, 1 },
+        // A middle layer of a ConvNet, several images in the batch
+        { 58, 3, 64, 32, 8 },
+        // A deep layer of a ConvNet, several images in the batch
+        { 16, 3, 512, 128, 8 },
+        // High resolution output (large NPQ)
+        {1536, 3, 64, 32, 1 },
+    };
+
+    for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        for (auto act_case : cases) {
+            // Direct CONV_2D
+            test_cases.emplace_back(new test_conv_2d(
+                { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
+                { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
+                kernel_type, 1, 1, 0, 0, 1, 1, false));
+        }
+    }
+
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
+    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1}));
+
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
+    
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); + + test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {512, 34, 2, 1})); + test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 80, 1, 1})); + test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 80, 4, 1})); + test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 1, 1})); + test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); + + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); + + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); + // qwen3next with CHUNK_SIZE 64 + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 })); + // qwen3next with CHUNK_SIZE 128 + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 })); + + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 })); + test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 })); + + test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 128, 4, 4 })); + test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 2048, 16, 5, 4 })); + test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 20000, 10, 4, 1 })); + + for (int bs : {1, 2, 3, 4, 5, 8, 512}) { + for (ggml_type type_a : all_types) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1})); + } + } + } + + // qwen3-30b-a3b + for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { + for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048)); + test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 128, 8, false, 768, bs, 2048, 1)); + } + } + } + + for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) { + for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, 
GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048)); + test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1)); + } + } + } + + + // gpt-oss-20b + for (int bs : {1, 4, 8, 512}) { + for (ggml_type type_a : {GGML_TYPE_MXFP4}) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 2880, bs, 2880)); + test_cases.emplace_back(new test_mul_mat_id_fusion(type_a, type_b, 32, 4, false, 2880, bs, 2880, 1)); + } + } + } + + for (int K : {3, 5}) { + for (int IC : {256, 2560}) { + for (int IW_IH : {32, 64, 256}) { + if (IC == 2560 && IW_IH == 256) { + // too big + continue; + } + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true)); + } + } + } + + // Qwen3-VL-8B https://github.com/ggml-org/llama.cpp/issues/17012 + test_cases.emplace_back(new test_flash_attn_ext(72, 72, 16, {1, 1}, 5776, 5776, false, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + + test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 1, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 4, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + + for (int kv : { 4096, 8192, 16384, }) { + for (int hs : { 64, 128, }) { + for (int nr : { 1, 4, }) { + test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, {nr, 1}, kv, 1, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + } + } + } + + for (int col : {8192, 16384, 32768, 65536, 131072, 262144, 524288}) { + for (int rows : {1, 4, 16}){ + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {col, rows, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); + } + } + + test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false)); + test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true)); + + test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1)); + test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2)); + + test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1})); + + + for (int n_token : {1, 512}) { + test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 128, 4, n_token)); + test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token)); + } + + for (bool fw : {true, false}) { // fw == forward + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (bool ff : {false, true}) { // freq_factors + for (float v : { 0, 1 }) { + test_cases.emplace_back(new test_rope(type, {128, 32, 512, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // llama 7B + test_cases.emplace_back(new test_rope(type, {128, 64, 512, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // llama 65B + test_cases.emplace_back(new test_rope(type, { 80, 32, 512, 1}, 20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 64, 8, 512, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 
128, GGML_ROPE_TYPE_MROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
+                test_cases.emplace_back(new test_rope(type, {128, 12, 512, 1}, 128, GGML_ROPE_TYPE_IMROPE, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
+                test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
+            }
+        }
+    }
+    }
+
+    std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
+        { 8192, 1, 1, 1 },
+        { 8192, 8192, 1, 1 },
+        { 128, 8192, 1, 1 },
+    };
+
+    for (auto it : reduce_rows_cases) {
+        test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
+    }
+
+    test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {65000, 16, 1, 1}));
+    test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {200000, 1, 1, 1}));
+    test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {200000, 16, 1, 1}));
+
+    test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {2, 1, 1, 1}, 1));
+    for (auto k : {1, 10, 40, 400}) {
+        for (auto nrows : {1, 16}) {
+            for (auto cols : {k, 1000, 65000, 200000}) {
+                test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {cols, nrows, 1, 1}, k));
+            }
+        }
+    }
+
+    for (auto nrows : {1, 4, 8, 16}) {
+        for (auto cols : {128, 1024, 4096, 8192, 16384, 32768, 65536, 131072, 200000, 2000000}) {
+            test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, {cols, nrows, 1, 1}));
+        }
+    }
+
+    // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill
+    test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 3328, 1, 1}, {4, 3328, 1, 1})); // generate
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 512, 1)); // prefill
+    test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 1, 1)); // generate
+
+    return test_cases;
+}
+
+static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
+                         printer * output_printer) {
+    auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
+        if (params_filter == nullptr) {
+            return;
+        }
+
+        std::regex params_filter_regex(params_filter);
+
+        for (auto it = test_cases.begin(); it != test_cases.end();) {
+            if (!std::regex_search((*it)->vars(), params_filter_regex)) {
+                it = test_cases.erase(it);
+                continue;
+            }
+
+            it++;
+        }
+    };
+
+    if (mode == MODE_TEST) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            test_operation_info info("", "", "CPU");
+            info.set_error("backend", "Failed to initialize CPU backend");
+            output_printer->print_operation(info);
+            return false;
+        }
+        // Use reference implementation on the CPU backend for comparison
+        using ggml_backend_cpu_set_use_ref_t = void (*)(ggml_backend_t, bool);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
+        auto * set_use_ref = (ggml_backend_cpu_set_use_ref_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");
+        if (set_use_ref) {
+            set_use_ref(backend_cpu, true);
+        }
+
+        size_t n_ok = 0;
+        size_t tests_run = 0;
+        std::vector<std::string> failed_tests;
+        for (auto & test : test_cases) {
+            test_status_t status = test->eval(backend, backend_cpu, op_names_filter, output_printer);
+            if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
+                continue;
+            }
+            tests_run++;
+            if (status == test_status_t::OK) {
+                n_ok++;
+            } else if (status == test_status_t::FAIL) {
+                failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
+            }
+        }
+        output_printer->print_summary(test_summary_info(n_ok, tests_run, false));
+        output_printer->print_failed_tests(failed_tests);
+
+        ggml_backend_free(backend_cpu);
+
+        return n_ok == tests_run;
+    }
+
+    if (mode == MODE_GRAD) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
+        size_t n_ok = 0;
+        for (auto & test : test_cases) {
+            if (test->eval_grad(backend, op_names_filter, output_printer)) {
+                n_ok++;
+            }
+        }
+        output_printer->print_summary(test_summary_info(n_ok, test_cases.size(), false));
+
+        return n_ok == test_cases.size();
+    }
+
+    if (mode == MODE_PERF) {
+        auto test_cases = make_test_cases_perf();
+        filter_test_cases(test_cases, params_filter);
+        for (auto & test : test_cases) {
+            test->eval_perf(backend, op_names_filter, output_printer);
+        }
+        return true;
+    }
+
+    if (mode == MODE_SUPPORT) {
+        auto test_cases = make_test_cases_eval();
+        filter_test_cases(test_cases, params_filter);
+
+        // Filter out fusion cases
+        test_cases.erase(
+            std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
+                return tc->run_whole_graph();
+            }),
+            test_cases.end()
+        );
+
+        for (auto & test : test_cases) {
+            test->eval_support(backend, op_names_filter, output_printer);
+        }
+        return true;
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+static void list_all_ops() {
+    printf("GGML operations:\n");
+    std::set<std::string> all_ops;
+
+    for (int i = 1; i < GGML_OP_COUNT; i++) {
+        all_ops.insert(ggml_op_name((enum ggml_op)i));
+    }
+    for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
+        all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
+    }
+    for (int i = 0; i < GGML_GLU_OP_COUNT; i++) {
+        all_ops.insert(ggml_glu_op_name((enum ggml_glu_op)i));
+    }
+    for (const auto & op : all_ops) {
+        printf("  %s\n", op.c_str());
+    }
+    printf("\nTotal: %zu operations\n", all_ops.size());
+}
+
+static void show_test_coverage() {
+    std::set<std::string> all_ops;
+    for (int i = 1; i < GGML_OP_COUNT; i++) {
+        auto op = (enum ggml_op)i;
+        if (op == GGML_OP_VIEW ||
+            op == GGML_OP_RESHAPE ||
+            op == GGML_OP_PERMUTE ||
+            op == GGML_OP_TRANSPOSE ||
+            op == GGML_OP_CONT ||
+            op == GGML_OP_GLU ||
+            op == GGML_OP_UNARY) {
+            continue;
+        }
+        all_ops.insert(ggml_op_name(op));
+    }
+    for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) {
+        all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i));
+    }
+    for (int i = 0; i < GGML_GLU_OP_COUNT; i++) {
+        all_ops.insert(ggml_glu_op_name((enum ggml_glu_op)i));
+    }
+    auto test_cases = make_test_cases_eval();
+    // Filter out fusion cases
+    test_cases.erase(
+        std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
+            return tc->run_whole_graph();
+        }),
+        test_cases.end()
+    );
+
+    std::set<std::string> tested_ops;
+
+    ggml_init_params params = {
+        /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+        /* .mem_base = */ NULL,
+        /* .no_alloc = */ true,
+    };
+
+    for (auto & test_case : test_cases) {
+        ggml_context * ctx = ggml_init(params);
+        if (ctx) {
+            test_case->mode = MODE_TEST;
+            ggml_tensor * out = test_case->build_graph(ctx);
+            if (out && out->op != GGML_OP_NONE) {
+                if (out->op == GGML_OP_UNARY) {
+                    tested_ops.insert(ggml_unary_op_name(ggml_get_unary_op(out)));
+                } else if (out->op == GGML_OP_GLU) {
+                    tested_ops.insert(ggml_glu_op_name(ggml_get_glu_op(out)));
+                } else {
+                    tested_ops.insert(ggml_op_name(out->op));
+                }
+            }
+            ggml_free(ctx);
+        }
+    }
+    std::set<std::string> covered_ops;
+    std::set<std::string> uncovered_ops;
+    for (const auto & op : all_ops) {
+        if (tested_ops.count(op) > 0) {
+            covered_ops.insert(op);
+        } else {
+            uncovered_ops.insert(op);
+        }
+    }
+
+    printf("Operations covered by tests (%zu):\n", covered_ops.size());
+    for (const auto & op : covered_ops) {
+        printf("  ✓ %s\n", op.c_str());
+    }
+    printf("\nOperations without tests (%zu):\n", uncovered_ops.size());
+    for (const auto & op : uncovered_ops) {
+        printf("  ✗ %s\n", op.c_str());
+    }
+
+    printf("\nCoverage Summary:\n");
+    printf("  Total operations: %zu\n", all_ops.size());
+    printf("  Tested operations: %zu\n", covered_ops.size());
+    printf("  Untested operations: %zu\n", uncovered_ops.size());
+    printf("  Coverage: %.1f%%\n", (double)covered_ops.size() / all_ops.size() * 100.0);
+}
+
+static void usage(char ** argv) {
+    printf("Usage: %s [mode] [-o <op names>] [-b <backend>] [-p <params regex>] [--output <format>] [--list-ops] [--show-coverage]\n", argv[0]);
+    printf("  valid modes:\n");
+    printf("    - test (default, compare with CPU backend for correctness)\n");
+    printf("    - grad (compare gradients from backpropagation with method of finite differences)\n");
+    printf("    - perf (performance evaluation)\n");
+    printf("    - support (probe backend operation support)\n");
+    printf("  op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc),\n");
+    printf("    optionally including the full test case string (e.g. \"ADD(type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1)\")\n");
+    printf("  --output specifies output format (default: console, options: console, sql, csv)\n");
+    printf("  --list-ops lists all available GGML operations\n");
+    printf("  --show-coverage shows test coverage\n");
+}
+
+int main(int argc, char ** argv) {
+    test_mode mode = MODE_TEST;
+    output_formats output_format = CONSOLE;
+    const char * op_names_filter = nullptr;
+    const char * backend_filter = nullptr;
+    const char * params_filter = nullptr;
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "test") == 0) {
+            mode = MODE_TEST;
+        } else if (strcmp(argv[i], "perf") == 0) {
+            mode = MODE_PERF;
+        } else if (strcmp(argv[i], "grad") == 0) {
+            mode = MODE_GRAD;
+        } else if (strcmp(argv[i], "support") == 0) {
+            mode = MODE_SUPPORT;
+        } else if (strcmp(argv[i], "-o") == 0) {
+            if (i + 1 < argc) {
+                op_names_filter = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
+        } else if (strcmp(argv[i], "-b") == 0) {
+            if (i + 1 < argc) {
+                backend_filter = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
+        } else if (strcmp(argv[i], "-p") == 0) {
+            if (i + 1 < argc) {
+                params_filter = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
+        } else if (strcmp(argv[i], "--output") == 0) {
+            if (i + 1 < argc) {
+                if (!output_format_from_str(argv[++i], output_format)) {
+                    usage(argv);
+                    return 1;
+                }
+            } else {
+                usage(argv);
+                return 1;
+            }
+        } else if (strcmp(argv[i], "--list-ops") == 0) {
+            list_all_ops();
+            return 0;
+        } else if (strcmp(argv[i], "--show-coverage") == 0) {
+            show_test_coverage();
+            return 0;
+        } else {
+            usage(argv);
+            return 1;
+        }
+    }
+
+    // load and enumerate backends
+    ggml_backend_load_all();
+
+    // Create printer for output format
+    std::unique_ptr<printer> output_printer = create_printer(output_format);
+    if (output_printer) {
+        output_printer->print_header();
+    }
+
+    output_printer->print_testing_start(testing_start_info(ggml_backend_dev_count()));
+
+    size_t n_ok = 0;
+
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+        if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
+            output_printer->print_backend_init(
+                backend_init_info(i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), true, "Skipping"));
+            n_ok++;
+            continue;
+        }
+
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
+            output_printer->print_backend_init(backend_init_info(
+                i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), true, "Skipping CPU backend"));
+            n_ok++;
+            continue;
+        }
+
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+        if (ggml_backend_set_n_threads_fn) {
+            // TODO: better value for n_threads
+            ggml_backend_set_n_threads_fn(backend, N_THREADS);
+        }
+
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        output_printer->print_backend_init(backend_init_info(i, ggml_backend_dev_count(), ggml_backend_dev_name(dev),
+                                                             false, "", ggml_backend_dev_description(dev),
+                                                             total / 1024 / 1024, free / 1024 / 1024, true));
+
+        bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
+
+        if (ok) {
+            n_ok++;
+        }
+        output_printer->print_backend_status(
+            backend_status_info(ggml_backend_name(backend), ok ? test_status_t::OK : test_status_t::FAIL));
+
+        ggml_backend_free(backend);
+    }
+
+    ggml_quantize_free();
+
+    if (output_printer) {
+        output_printer->print_footer();
+    }
+
+    output_printer->print_overall_summary(
+        overall_summary_info(n_ok, ggml_backend_dev_count(), n_ok == ggml_backend_dev_count()));
+
+    if (n_ok != ggml_backend_dev_count()) {
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-backend-sampler.cpp b/llama.cpp/tests/test-backend-sampler.cpp
new file mode 100644
index 0000000..c10bde9
--- /dev/null
+++ b/llama.cpp/tests/test-backend-sampler.cpp
@@ -0,0 +1,1165 @@
+#include "ggml.h"
+#include "llama.h"
+#include "llama-cpp.h"
+#include "get-model.h"
+#include "common.h"
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct test_args {
+    std::string model;
+    std::string test;
+    std::string device = "auto";
+};
+
+struct test_params {
+    llama_model_ptr model;
+};
+
+static llama_model_ptr load_model(const test_args & args) {
+    auto mparams = llama_model_default_params();
+
+    ggml_backend_dev_t devs[2] = { nullptr, nullptr };
+
+    if (args.device != "auto") {
+        if (args.device == "gpu") {
+            devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+
+            if (devs[0] == nullptr) {
+                fprintf(stderr, "Error: GPU requested but not available\n");
+                return nullptr;
+            }
+
+            mparams.n_gpu_layers = 999;
+        } else if (args.device == "cpu") {
+            devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+
+            mparams.n_gpu_layers = 0;
+        } else {
+            fprintf(stderr, "Error: invalid device '%s'\n", args.device.c_str());
+            return nullptr;
+        }
+
+        mparams.devices = devs;
+
+        fprintf(stderr, "Using device: %s\n", ggml_backend_dev_name(devs[0]));
+    }
+
+    llama_model_ptr res;
+
+    res.reset(llama_model_load_from_file(args.model.c_str(), mparams));
+
+    if (!res) {
+        fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", args.model.c_str());
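+        // A missing model file is treated as a soft skip rather than a failure,
+        // so the test binary can still run in environments without the weights.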
+        return nullptr;
+    }
+
+    return res;
+}
+
+struct test_context {
+    llama_context_ptr ctx;
+
+    int n_vocab = 0;
+
+    const llama_vocab * vocab = nullptr;
+
+    std::unordered_map<llama_seq_id, int32_t> seq_positions;
+    std::unordered_map<llama_seq_id, int32_t> last_batch_info;
+
+    // each config pairs a seq_id with the sampler chain to run for that sequence
+    test_context(const test_params & params, std::vector<llama_sampler_seq_config> & configs, int32_t n_seq_max = -1) {
+        auto * model = params.model.get();
+
+        GGML_ASSERT(model);
+        GGML_ASSERT(!ctx);
+
+        llama_context_params cparams = llama_context_default_params();
+        cparams.n_ctx      = 512;
+        cparams.n_batch    = 512;
+        cparams.samplers   = configs.data();
+        cparams.n_samplers = configs.size();
+
+        // If n_seq_max is not specified, calculate it from configs
+        if (n_seq_max < 0) {
+            int32_t max_seq_id = 0;
+            for (const auto & config : configs) {
+                max_seq_id = std::max(config.seq_id, max_seq_id);
+            }
+            cparams.n_seq_max = max_seq_id + 1;
+        } else {
+            cparams.n_seq_max = n_seq_max;
+        }
+
+        ctx.reset(llama_init_from_model(model, cparams));
+        if (!ctx) {
+            throw std::runtime_error("failed to create context");
+        }
+
+        llama_set_warmup(ctx.get(), false);
+
+        vocab   = llama_model_get_vocab(model);
+        n_vocab = llama_vocab_n_tokens(vocab);
+    }
+
+    bool decode(const std::map<llama_seq_id, std::string> & prompts) {
+        GGML_ASSERT(ctx);
+
+        last_batch_info.clear();
+        llama_batch batch = llama_batch_init(512, 0, prompts.size());
+
+        for (const auto & [seq_id, prompt] : prompts) {
+            std::vector<llama_token> tokens;
+            tokens.push_back(llama_vocab_bos(vocab));
+
+            std::vector<llama_token> prompt_tokens(32);
+            int n_tokens = llama_tokenize(vocab, prompt.c_str(), prompt.length(),
+                                          prompt_tokens.data(), prompt_tokens.size(),
+                                          false, false);
+            if (n_tokens < 0) {
+                fprintf(stderr, "Warning: tokenization failed for seq_id %d\n", seq_id);
+                llama_batch_free(batch);
+                return false;
+            }
+
+            for (int i = 0; i < n_tokens; i++) {
+                tokens.push_back(prompt_tokens[i]);
+            }
+
+            if (seq_positions.find(seq_id) == seq_positions.end()) {
+                seq_positions[seq_id] = 0;
+            }
+
+            int32_t start_pos = seq_positions[seq_id];
+            for (size_t i = 0; i < tokens.size(); i++) {
+                common_batch_add(batch, tokens[i], start_pos + i, { seq_id }, i == tokens.size() - 1);
+            }
+
+            seq_positions[seq_id] = start_pos + tokens.size();
+        }
+
+        printf("Batch contents:\n");
+        printf("n_tokens: %d\n", batch.n_tokens);
+        for (int i = 0; i < batch.n_tokens; i++) {
+            printf("token[%d]: tok=%-5d, pos=%d, n_seq_id=%d, seq_ids=[", i, batch.token[i], batch.pos[i], batch.n_seq_id[i]);
+
+            for (int j = 0; j < batch.n_seq_id[i]; j++) {
+                printf("%d%s", batch.seq_id[i][j], j < batch.n_seq_id[i]-1 ?
", " : ""); + } + printf("], logits=%d\n", batch.logits[i]); + } + + if (llama_decode(ctx.get(), batch) != 0) { + fprintf(stderr, "Warning: llama_decode failed\n"); + llama_batch_free(batch); + return false; + } + + // Build mapping from seq id to batch token idx + for (int i = 0; i < batch.n_tokens; i++) { + if (batch.logits[i]) { + llama_seq_id seq_id = batch.seq_id[i][0]; + last_batch_info[seq_id] = i; + } + } + + llama_batch_free(batch); + return true; + } + + int32_t idx_for_seq(llama_seq_id seq_id) { + auto it = last_batch_info.find(seq_id); + if (it == last_batch_info.end()) { + fprintf(stderr, "Error: no batch index found for seq_id %d\n", seq_id); + return -1; + } + return it->second; + } + + void update_batch_info(const llama_batch & batch) { + last_batch_info.clear(); + for (int i = 0; i < batch.n_tokens; i++) { + if (batch.logits[i]) { + llama_seq_id cur_seq = batch.seq_id[i][0]; + last_batch_info[cur_seq] = i; + } + } + } + + bool decode_token(llama_token token, llama_seq_id seq_id = 0) { + GGML_ASSERT(ctx); + + llama_batch batch = llama_batch_init(1, 0, 1); + int32_t pos = seq_positions[seq_id]; + common_batch_add(batch, token, pos, { seq_id }, true); + + if (llama_decode(ctx.get(), batch) != 0) { + fprintf(stderr, "Warning: llama_decode failed for token %d in seq %d\n", token, seq_id); + llama_batch_free(batch); + return false; + } + + update_batch_info(batch); + + seq_positions[seq_id]++; + llama_batch_free(batch); + + return true; + } + + bool decode_tokens(const std::map & seq_tokens) { + GGML_ASSERT(ctx); + + llama_batch batch = llama_batch_init(seq_tokens.size(), 0, seq_tokens.size()); + + for (const auto & [seq_id, token] : seq_tokens) { + int32_t pos = seq_positions[seq_id]; + common_batch_add(batch, token, pos, { seq_id }, true); + } + + if (llama_decode(ctx.get(), batch) != 0) { + fprintf(stderr, "Warning: llama_decode failed for batch tokens\n"); + llama_batch_free(batch); + return false; + } + + for (const auto & [seq_id, _] : seq_tokens) { + seq_positions[seq_id]++; + } + + update_batch_info(batch); + + llama_batch_free(batch); + + return true; + } + + std::string token_to_piece(llama_token token, bool special) const { + std::string piece; + piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' + const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + if (n_chars < 0) { + piece.resize(-n_chars); + int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + GGML_ASSERT(check == -n_chars); + } else { + piece.resize(n_chars); + } + + return piece; + } +}; + +static void test_backend_greedy_sampling(const test_params & params) { + const int seq_id = 0; + + struct llama_sampler_chain_params backend_sampler_params = llama_sampler_chain_default_params(); + llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_sampler_params)); + + llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_greedy()); + std::vector backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }}; + + test_context test_ctx(params, backend_sampler_configs); + + if (!test_ctx.decode({{seq_id, "Some"}})) { + GGML_ASSERT(false && "Failed to decode token"); + } + + int32_t batch_idx = test_ctx.idx_for_seq(seq_id); + + llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx); + printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str()); + GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab); + + token = 
+    printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+    for (int i = 0; i < 10; i++) {
+        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
+        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), loop_idx);
+        printf("Generation step %d: token id:%d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
+        if (!test_ctx.decode_token(token, 0)) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+    }
+}
+
+static void test_backend_top_k_sampling(const test_params & params) {
+    const int seq_id = 0;
+    const int32_t k = 8;
+    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_k(k));
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    if (!test_ctx.decode({{seq_id, "Hello"}})) {
+        GGML_ASSERT(false && "Failed to decode token");
+    }
+
+    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+    float *  logits   = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
+    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+    for (size_t i = 0; i < n_logits; ++i) {
+        printf("top_k logit[%zu] = %.6f\n", i, logits[i]);
+    }
+
+    llama_token * candidates   = llama_get_sampled_candidates_ith(test_ctx.ctx.get(), batch_idx);
+    uint32_t      n_candidates = llama_get_sampled_candidates_count_ith(test_ctx.ctx.get(), batch_idx);
+    for (size_t i = 0; i < n_candidates; ++i) {
+        printf("top_k candidate[%zu] = %d : %s\n", i, candidates[i],
+               test_ctx.token_to_piece(candidates[i], false).c_str());
+    }
+
+    // Sample with a CPU sampler to verify that hybrid sampling is possible:
+    // top-k on the backend first, then dist on the CPU.
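+    // The getters above already reflect the backend top-k filtering: only the k
+    // surviving logits and candidate ids are exposed, so the CPU chain below
+    // draws from that reduced set rather than the full vocabulary.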
+    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+    GGML_ASSERT(chain->iface->backend_apply != nullptr);
+
+    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
+    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+    printf("backend top-k hybrid sampling test PASSED\n");
+}
+
+static void test_backend_temp_sampling(const test_params & params) {
+    {
+        const float temp_0 = 0.8f;
+        struct llama_sampler_chain_params backend_chain_params_0 = llama_sampler_chain_default_params();
+        llama_sampler_ptr backend_sampler_chain_0(llama_sampler_chain_init(backend_chain_params_0));
+        llama_sampler_chain_add(backend_sampler_chain_0.get(), llama_sampler_init_temp(temp_0));
+
+        const float temp_1 = 0.1f;
+        struct llama_sampler_chain_params backend_chain_params_1 = llama_sampler_chain_default_params();
+        llama_sampler_ptr backend_sampler_chain_1(llama_sampler_chain_init(backend_chain_params_1));
+        llama_sampler_chain_add(backend_sampler_chain_1.get(), llama_sampler_init_temp(temp_1));
+
+        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+            { 0, backend_sampler_chain_0.get() },
+            { 1, backend_sampler_chain_1.get() }
+        };
+
+        test_context test_ctx(params, backend_sampler_configs);
+
+        if (!test_ctx.decode({{0, "Some where over the"}, {1, "Once upon a"}})) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+
+        // Verify sequence 0
+        {
+            int32_t batch_idx = test_ctx.idx_for_seq(0);
+            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+            GGML_ASSERT(n_logits == test_ctx.n_vocab);
+
+            // Sample from sequence 0 using CPU sampler
+            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
+
+            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+            const std::string token_str = test_ctx.token_to_piece(token, false);
+            printf("Sequence 0 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+        }
+
+        // Verify sequence 1
+        {
+            int32_t batch_idx = test_ctx.idx_for_seq(1);
+
+            // Sample from sequence 1 using CPU sampler
+            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
+
+            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+            const std::string token_str = test_ctx.token_to_piece(token, false);
+            printf("Sequence 1 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+        }
+    }
+
+    // lambda for testing non-positive temperature values
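+    // With temp <= 0.0f the backend sampler is expected to collapse to argmax,
+    // so exactly one logit should remain for the sequence (asserted below).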
+    auto test_argmax_temp = [&](float temp) {
+        printf("\nTesting temperature = %.1f\n", temp);
+
+        int seq_id = 0;
+        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp(temp));
+
+        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+            { seq_id, backend_sampler_chain.get() },
+        };
+
+        test_context test_ctx(params, backend_sampler_configs);
+
+        if (!test_ctx.decode({{seq_id, "Once"}})) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+
+        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+        GGML_ASSERT(n_logits == 1);
+    };
+
+    test_argmax_temp(0.0f);
+    test_argmax_temp(-1.0f);
+
+    printf("backend temp sampling test PASSED\n");
+}
+
+static void test_backend_temp_ext_sampling(const test_params & params) {
+    {
+        int seq_id = 0;
+        const float temp     = 0.8f;
+        const float delta    = 0.5f;
+        const float exponent = 1.5f;
+        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));
+
+        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+            { seq_id, backend_sampler_chain.get() },
+        };
+
+        test_context test_ctx(params, backend_sampler_configs);
+
+        if (!test_ctx.decode({{seq_id, "Once upon a"}})) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+
+        // Verify sequence 0
+        {
+            int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+            GGML_ASSERT(n_logits == test_ctx.n_vocab);
+        }
+    }
+
+    // lambda for testing non-positive temp/delta/exponent values
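+    // temp <= 0.0f combined with delta >= 0.0f should reduce to argmax (a single
+    // logit); any other combination leaves the full vocabulary (asserted below).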
+    auto test_argmax_temp = [&](float temp, float delta, float exponent) {
+        printf("\nTesting temperature = %.1f, delta = %.1f, exponent = %.1f\n", temp, delta, exponent);
+
+        int seq_id = 0;
+        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));
+
+        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+            { seq_id, backend_sampler_chain.get() },
+        };
+
+        test_context test_ctx(params, backend_sampler_configs);
+
+        if (!test_ctx.decode({{seq_id, "Once"}})) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+
+        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+
+        if (temp <= 0.0f && delta >= 0.0f) {
+            GGML_ASSERT(n_logits == 1);
+        } else {
+            GGML_ASSERT(n_logits == (uint32_t) test_ctx.n_vocab);
+        }
+    };
+
+    test_argmax_temp(0.0f, 0.3f, 1.0f);  // Greedy (temp=0)
+    test_argmax_temp(-1.0f, 0.3f, 2.0f); // Greedy (temp<0)
+    test_argmax_temp(0.8f, 0.0f, 2.0f);  // Temperature scaling
+
+    printf("backend temp_ext sampling test PASSED\n");
+}
+
+static void test_backend_min_p_sampling(const test_params & params) {
+    const int seq_id = 0;
+    const float p = 0.1f;
+    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_min_p(p, 0));
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    if (!test_ctx.decode({{seq_id, "Hello"}})) {
+        GGML_ASSERT(false && "Failed to decode token");
+    }
+
+    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+    float *  logits   = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
+    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+
+    // Collect the logits that survived the min-p threshold
+    std::vector<float> filtered_logits;
+    for (size_t i = 0; i < n_logits; ++i) {
+        if (logits[i] > -1e9f) {
+            filtered_logits.push_back(logits[i]);
+            //printf("min_p logit[%zu] = %.6f\n", i, logits[i]);
+        }
+    }
+    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);
+
+    // Sample with a CPU sampler to verify the filtered logits are reasonable
+    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));
+
+    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+    const std::string token_str = test_ctx.token_to_piece(token, false);
+    printf("min-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+    // Decode and sample 10 more tokens
+    for (int i = 0; i < 10; i++) {
+        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
+        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
+        printf("min-p gen step %d: token id:%5d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
+        if (!test_ctx.decode_token(token, 0)) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+    }
+
+    printf("min-p sampling test PASSED\n");
+}
+
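+// Top-p (nucleus) filtering keeps the smallest set of tokens whose cumulative
+// probability exceeds p; the test below checks that the surviving logit set is
+// a non-empty, strict subset of the vocabulary before sampling from it.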
+static void test_backend_top_p_sampling(const test_params & params) {
+    const int seq_id = 0;
+    const float p = 0.9f;
+    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_p(p, 0));
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    if (!test_ctx.decode({{seq_id, "Hello"}})) {
+        return;
+    }
+
+    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+    float *  logits   = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
+    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+
+    // Collect the logits that survived the top-p threshold
+    std::vector<float> filtered_logits;
+    for (size_t i = 0; i < n_logits; ++i) {
+        if (logits[i] > -1e9f) {
+            filtered_logits.push_back(logits[i]);
+        }
+    }
+    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);
+    GGML_ASSERT(filtered_logits.size() > 0);
+
+    // Sample with a CPU sampler to verify the filtered logits are reasonable
+    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));
+
+    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+    const std::string token_str = test_ctx.token_to_piece(token, false);
+    printf("top-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+    // Decode and sample 10 more tokens
+    for (int i = 0; i < 10; i++) {
+        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
+        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
+        printf("top-p gen step %d: token id:%5d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
+        test_ctx.decode_token(token, 0);
+    }
+
+    printf("top-p sampling test PASSED\n");
+}
+
+static void test_backend_multi_sequence_sampling(const test_params & params) {
+    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
+    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
+    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_greedy());
+
+    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
+    llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
+    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_greedy());
+
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+        { 0, sampler_chain_0.get() },
+        { 1, sampler_chain_1.get() }
+    };
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    std::map<llama_seq_id, std::string> prompts = {
+        {0, "Hello"},
+        {1, "Some"}
+    };
+
+    if (!test_ctx.decode(prompts)) {
+        GGML_ASSERT(false && "Failed to decode token");
+    }
+
+    // Verify sequence 0
+    {
+        int32_t batch_idx = test_ctx.idx_for_seq(0);
+        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+        const std::string token_str = test_ctx.token_to_piece(token, false);
+        printf("Seq 0 sampled token id=%d, string='%s'\n", token, token_str.c_str());
+        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+    }
+
+    // Verify sequence 1
+    {
+        int32_t batch_idx = test_ctx.idx_for_seq(1);
+        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+        const std::string token_str = test_ctx.token_to_piece(token, false);
+        printf("Seq 1 sampled token id=%d, string='%s'\n", token, token_str.c_str());
+        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+    }
+
+    // Generate tokens for each sequence
+    printf("\nMulti-sequence generation:\n");
+    for (int step = 0; step < 4; step++) {
+        std::map<llama_seq_id, llama_token> tokens;
+
+        for (llama_seq_id seq_id : {0, 1}) {
+            int32_t idx = test_ctx.idx_for_seq(seq_id);
+            llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), idx);
+            const std::string token_str = test_ctx.token_to_piece(token, false);
+            printf("  Seq %d, step %d: token id=%d, string='%s'\n", seq_id, step, token, token_str.c_str());
+            tokens[seq_id] = token;
+        }
+
+        // Decode all tokens in a single batch
+        if (!test_ctx.decode_tokens(tokens)) {
+            GGML_ASSERT(false && "Failed to decode token");
+        }
+    }
+
+    printf("backend multi-sequence sampling test PASSED\n");
+}
+
+static void test_backend_dist_sampling(const test_params & params) {
+    const int seq_id = 189;
+    const int32_t seed = 88;
+
+    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    if (!test_ctx.decode({{seq_id, "Some"}})) {
+        GGML_ASSERT(false && "Failed to decode token");
+    }
+
+    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+    llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+    //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);
+
+    token = llama_get_sampled_token_ith(test_ctx.ctx.get(), -1);
+    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
+    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+
+    printf("backend dist sampling test PASSED\n");
+}
+
+static void test_backend_dist_sampling_and_cpu(const test_params & params) {
+    const int seq_id = 0;
+    const int32_t seed = 88;
+
+    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+    test_context test_ctx(params, backend_sampler_configs);
+
+    if (!test_ctx.decode({{seq_id, "Some"}})) {
+        GGML_ASSERT(false && "Failed to decode token");
+    }
+
+    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+    // Sample using CPU sampler
+    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
+
+    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+    llama_token cpu_token     = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
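+    // The backend chain already terminated with dist, so only the chosen token
+    // remains for this batch index; the CPU dist sampler (seeded differently)
+    // must therefore return the same token.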
cpu_token, test_ctx.token_to_piece(cpu_token, false).c_str());
+ GGML_ASSERT(backend_token == cpu_token);
+
+ printf("backend dist & cpu sampling test PASSED\n");
+}
+
+static void test_backend_logit_bias_sampling(const test_params & params) {
+ const auto * model = params.model.get();
+ const auto * vocab = llama_model_get_vocab(model);
+
+ const int seq_id = 0;
+
+ std::vector<llama_logit_bias> logit_bias;
+
+ // Get the token for the piece "World".
+ const std::string piece = "World";
+ std::vector<llama_token> tokens(16);
+ llama_tokenize(vocab, piece.c_str(), piece.size(), tokens.data(), tokens.size(), false, false);
+
+ llama_token bias_token = tokens[0];
+ // TODO: biasing too much here makes the Vulkan sampling fail - should be investigated further
+ // https://github.com/ggml-org/llama.cpp/actions/runs/20894267644/job/60030252675?pr=18753#step:3:23350
+ //logit_bias.push_back({ bias_token, +100.0f });
+ logit_bias.push_back({ bias_token, +10.0f });
+
+ printf("biasing token piece '%s' -> token id %d\n", piece.c_str(), bias_token);
+
+ struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+ llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_logit_bias(
+ llama_vocab_n_tokens(vocab),
+ logit_bias.size(),
+ logit_bias.data()));
+ llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(88));
+
+ std::vector backend_sampler_configs = {
+ { seq_id, backend_sampler_chain.get() },
+ };
+
+ test_context test_ctx(params, backend_sampler_configs);
+
+ if (!test_ctx.decode({{seq_id, "Hello"}})) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
+ printf("sampled token = %d, expected = %d\n", backend_token, bias_token);
+ GGML_ASSERT(backend_token == bias_token);
+
+ printf("backend logit bias sampling test PASSED\n");
+}
+
+// This test verifies that it is possible to have two different backend sampler chains:
+// one sequence samples fully on the backend (dist), while the other only filters on
+// the backend (top-k) and leaves the final sampling step to the CPU.
+static void test_backend_mixed_sampling(const test_params & params) {
+ struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
+ llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
+ llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));
+
+ int k = 40;
+ struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
+ llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
+ llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_top_k(k));
+
+ std::vector backend_sampler_configs = {
+ { 0, sampler_chain_0.get() },
+ { 1, sampler_chain_1.get() }
+ };
+
+ test_context test_ctx(params, backend_sampler_configs);
+
+ std::map<llama_seq_id, std::string> prompts = {
+ {0, "Hello"},
+ {1, "Some"}
+ };
+
+ if (!test_ctx.decode(prompts)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ // Verify sequence 0, which used the dist backend sampler.
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(0);
+ llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);
+ //GGML_ASSERT(llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx) == 0);
+ }
+
+ // Verify sequence 1, which used the top-k backend sampler.
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(1);
+ float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
+ GGML_ASSERT(logits != nullptr);
+ size_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
+ GGML_ASSERT(n_logits == (size_t) k);
+ GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx) == LLAMA_TOKEN_NULL);
+ }
+
+ printf("backend mixed sampling test PASSED\n");
+}
+
+static void test_backend_set_sampler(const test_params & params) {
+ const int seq_id = 0;
+ const int32_t seed = 88;
+
+ struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+ llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
+ std::vector backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+ test_context test_ctx(params, backend_sampler_configs);
+
+ if (!test_ctx.decode({{seq_id, "Hello"}})) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
+
+ // Sample using the backend sampler configured above
+ llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+ const std::string backend_token_str = test_ctx.token_to_piece(backend_token, false);
+ printf("dist sampled token = %d, string='%s'\n", backend_token, backend_token_str.c_str());
+
+ // Now clear the backend sampler for this sequence.
+ llama_set_sampler(test_ctx.ctx.get(), seq_id, nullptr);
+ printf("Cleared backend sampler for seq_id %d\n", seq_id);
+
+ // Sample using a CPU sampler
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+ llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
+
+ std::map<llama_seq_id, llama_token> tokens = { { seq_id, backend_token }, };
+ if (!test_ctx.decode_tokens(tokens)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ // Should not have any sampled token or probs after clearing the backend sampler.
+ const int32_t idx = test_ctx.idx_for_seq(seq_id);
+ GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), idx) == LLAMA_TOKEN_NULL);
+ GGML_ASSERT(llama_get_sampled_probs_ith(test_ctx.ctx.get(), idx) == nullptr);
+
+ // Sample the token using the CPU sampler chain.
+ llama_token token2 = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), seq_id);
+ const std::string token2_str = test_ctx.token_to_piece(token2, false);
+ printf("CPU sampled token after clearing backend sampler: id=%d, string='%s'\n", token2, token2_str.c_str());
+ std::map<llama_seq_id, llama_token> tokens2 = { { seq_id, token2 }, };
+
+ // Set a new backend sampler for the sequence.
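+ // (Sketch of the assumed llama_set_sampler contract, based on its usage in
+ // this test: passing nullptr detaches the backend sampler for a sequence,
+ // and an attached chain takes effect from the next decode, e.g.
+ //     llama_set_sampler(ctx, seq_id, nullptr);     // detach
+ //     llama_set_sampler(ctx, seq_id, chain.get()); // attach a new chain
+ // The chain attached below combines top-k filtering with dist sampling.)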
+ struct llama_sampler_chain_params new_backend_chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr new_backend_sampler_chain(llama_sampler_chain_init(new_backend_chain_params));
+ llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_top_k(20));
+ llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_dist(seed));
+ llama_set_sampler(test_ctx.ctx.get(), seq_id, new_backend_sampler_chain.get());
+
+ if (!test_ctx.decode_tokens(tokens2)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ llama_token new_backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
+ const std::string new_backend_token_str = test_ctx.token_to_piece(new_backend_token, false);
+ printf("dist sampled token = %d, string='%s'\n", new_backend_token, new_backend_token_str.c_str());
+
+ printf("backend set sampler test PASSED\n");
+}
+
+static void test_backend_cpu_mixed_batch(const test_params & params) {
+ // Sequence 0 uses backend sampling
+ struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
+ llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
+ llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));
+
+ std::vector backend_sampler_configs = {
+ { 0, sampler_chain_0.get() },
+ };
+
+ // We need 2 sequences: seq 0 with backend sampling, seq 1 with CPU sampling
+ test_context test_ctx(params, backend_sampler_configs, 2);
+
+ std::map<llama_seq_id, std::string> prompts = {
+ {0, "Hello"}, // Will use backend sampling
+ {1, "Some"} // Will use CPU sampling
+ };
+
+ if (!test_ctx.decode(prompts)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ // Verify sequence 0 (backend sampled)
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(0);
+ llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Seq 0 (backend) sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ }
+
+ // Verify sequence 1 (CPU sampled)
+ {
+ int32_t batch_idx = test_ctx.idx_for_seq(1);
+
+ llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+ GGML_ASSERT(backend_token == LLAMA_TOKEN_NULL);
+
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+ llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());
+
+ llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("Seq 1 (CPU) sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ }
+
+ // Clear/remove the backend sampler, and sample again
+ {
+ // clear the backend sampler for seq 0 so that there are no backend
+ // samplers.
+ llama_set_sampler(test_ctx.ctx.get(), 0, nullptr);
+
+ // Create a CPU sampler and verify we can sample from it.
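+ // (With no backend samplers attached any more, the expectation is that
+ // full logits are available again, so an ordinary CPU chain, here just
+ // greedy, can sample via llama_sampler_sample() as before.)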
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
+ llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());
+
+ int32_t batch_idx = test_ctx.idx_for_seq(1);
+ llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
+ if (!test_ctx.decode_token(token, 1)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+ }
+
+ // Set a backend sampler so that we can verify that it can be reset
+ {
+ struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr sampler_chain(llama_sampler_chain_init(chain_params));
+ llama_sampler_chain_add(sampler_chain.get(), llama_sampler_init_dist(88));
+
+ llama_set_sampler(test_ctx.ctx.get(), 0, sampler_chain.get());
+
+ if (!test_ctx.decode_token(3834, 0)) {
+ GGML_ASSERT(false && "Failed to decode token");
+ }
+
+ int32_t batch_idx = test_ctx.idx_for_seq(0);
+ llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
+ const std::string token_str = test_ctx.token_to_piece(token, false);
+ printf("re-added backend sampled token id=%d, string='%s'\n", token, token_str.c_str());
+ GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+ }
+
+ printf("backend-cpu mixed batch test PASSED\n");
+}
+
+static void test_backend_max_outputs(const test_params & params) {
+ const int seq_id = 0;
+ const int32_t seed = 88;
+
+ llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
+ llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
+ llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
+ std::vector backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
+
+ test_context test_ctx(params, backend_sampler_configs);
+
+ llama_batch batch = llama_batch_init(512, 0, 1);
+ std::string prompt = "Hello";
+
+ std::vector<llama_token> tokens;
+ tokens.push_back(llama_vocab_bos(test_ctx.vocab));
+
+ std::vector<llama_token> prompt_tokens(32);
+ int n_tokens = llama_tokenize(test_ctx.vocab, prompt.c_str(), prompt.length(),
+ prompt_tokens.data(), prompt_tokens.size(),
+ false, false);
+ for (int i = 0; i < n_tokens; i++) {
+ tokens.push_back(prompt_tokens[i]);
+ }
+
+ for (size_t i = 0; i < tokens.size(); i++) {
+ // mark every token as an output to trigger the error
+ common_batch_add(batch, tokens[i], i, { seq_id }, true);
+ }
+
+ printf(">>> test_max_outputs expected error start:\n");
+ const int ret = llama_decode(test_ctx.ctx.get(), batch);
+ GGML_ASSERT(ret != 0 && "llama_decode should not succeed with multiple outputs per sequence");
+ printf("<<< test_max_outputs expected error end.\n");
+ llama_batch_free(batch);
+
+ printf("backend max outputs test PASSED\n");
+}
+
+struct backend_test_case {
+ std::string name;
+ void (*fn)(const test_params &);
+ bool enabled_by_default;
+};
+
+static const backend_test_case BACKEND_TESTS[] = {
+ { "greedy", test_backend_greedy_sampling, true },
+ { "logit_bias", test_backend_logit_bias_sampling, true },
+ { "temp", test_backend_temp_sampling, true },
+ { "temp_ext", test_backend_temp_ext_sampling, true },
+ { "top_k", test_backend_top_k_sampling, true },
+ { "multi_sequence", test_backend_multi_sequence_sampling, true },
+ { "dist", test_backend_dist_sampling, true },
+ { "dist_and_cpu", test_backend_dist_sampling_and_cpu, true },
+ { "set_sampler", test_backend_set_sampler, true },
+ { "max_outputs", test_backend_max_outputs, true },
+ { "mixed",
test_backend_mixed_sampling, true }, + { "min_p", test_backend_min_p_sampling, true }, + { "cpu_mixed", test_backend_cpu_mixed_batch, true }, + { "top_p", test_backend_top_p_sampling, true }, +}; + +static test_args parse_cli(int argc, char ** argv) { + test_args out; + + for (int i = 1; i < argc; ++i) { + const char * arg = argv[i]; + + if (std::strcmp(arg, "--test") == 0) { + if (i + 1 >= argc) { + fprintf(stderr, "--test expects a value\n"); + exit(EXIT_FAILURE); + } + out.test = argv[++i]; + continue; + } + if (std::strncmp(arg, "--test=", 7) == 0) { + out.test = arg + 7; + continue; + } + if (std::strcmp(arg, "--model") == 0) { + if (i + 1 >= argc) { + fprintf(stderr, "--model expects a value\n"); + exit(EXIT_FAILURE); + } + out.model = argv[++i]; + continue; + } + if (std::strncmp(arg, "--model=", 8) == 0) { + out.model = arg + 8; + continue; + } + if (std::strcmp(arg, "--device") == 0) { + if (i + 1 >= argc) { + fprintf(stderr, "--device expects a value (cpu or gpu)\n"); + exit(EXIT_FAILURE); + } + out.device = argv[++i]; + continue; + } + if (std::strncmp(arg, "--device=", 9) == 0) { + out.device = arg + 9; + continue; + } + if (out.model.empty()) { + out.model = arg; + continue; + } + + fprintf(stderr, "Unexpected argument: %s\n", arg); + exit(EXIT_FAILURE); + } + + if (out.device != "cpu" && out.device != "gpu" && out.device != "auto") { + fprintf(stderr, "Invalid device '%s'. Must be 'cpu', 'gpu' or 'auto'\n", out.device.c_str()); + exit(EXIT_FAILURE); + } + + return out; +} + +static std::vector collect_tests_to_run(const std::string & requested) { + std::vector selected; + + if (!requested.empty()) { + for (const auto & test : BACKEND_TESTS) { + if (test.name == requested) { + selected.push_back(&test); + break; + } + } + if (selected.empty()) { + fprintf(stderr, "Unknown test '%s'. Available tests:\n", requested.c_str()); + for (const auto & test : BACKEND_TESTS) { + fprintf(stderr, " %s\n", test.name.c_str()); + } + exit(EXIT_FAILURE); + } + } else { + for (const auto & test : BACKEND_TESTS) { + if (test.enabled_by_default) { + selected.push_back(&test); + } + } + } + + if (selected.empty()) { + fprintf(stderr, "No backend sampling tests selected. 
Use --test= to pick one.\n"); + } + + return selected; +} + +static void run_tests(const std::vector & tests, const test_params & args) { + for (const auto & test : tests) { + fprintf(stderr, "\n=== %s ===\n", test->name.c_str()); + try { + test->fn(args); + } catch (const std::exception & e) { + fprintf(stderr, "Error running test '%s': %s\n", test->name.c_str(), e.what()); + exit(EXIT_FAILURE); + } + } +} + +int main(int argc, char ** argv) { + test_args args = parse_cli(argc, argv); + + if (args.model.empty()) { + args.model = get_model_or_exit(1, argv); + } + + { + std::ifstream file(args.model); + if (!file.is_open()) { + fprintf(stderr, "no model '%s' found\n", args.model.c_str()); + return EXIT_FAILURE; + } + } + + fprintf(stderr, "using '%s'\n", args.model.c_str()); + + llama_backend_init(); + + test_params params = { + /*.model =*/ load_model(args), + }; + + const std::vector tests = collect_tests_to_run(args.test); + if (!tests.empty()) { + run_tests(tests, params); + } + + return 0; +} diff --git a/llama.cpp/tests/test-barrier.cpp b/llama.cpp/tests/test-barrier.cpp new file mode 100644 index 0000000..61f73ad --- /dev/null +++ b/llama.cpp/tests/test-barrier.cpp @@ -0,0 +1,236 @@ +#include "ggml.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_NARGS 2 + +static void test_barrier(int n_threads, int n_rounds) { + struct ggml_init_params params = { + /* .mem_size = */ 1024*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + struct ggml_context * ctx = ggml_init(params); + + // Create graph + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + // Lots of small, parallel ops where barriers in between will dominate + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64); + for (int i = 0; i < 1000; i++) { + struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); + out = ggml_mul_mat(ctx, a, out); + + struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64); + out = ggml_mul_mat(ctx, d, out); + } + + ggml_build_forward_expand(gf, out); + int n_nodes = ggml_graph_n_nodes(gf); + + // Create threadpool + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + if (!threadpool) { + fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads); + exit(1); + } + + // The test runs with constant number of threads + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); + + std::vector work_data(cplan.work_size); + cplan.work_data = work_data.data(); + + std::cerr << "graph-compute with" + << "\n n_threads: " << n_threads + << "\n n_nodes: " << n_nodes + << "\n n_rounds: " << n_rounds + << "\n"; + // ggml_graph_print(gf); + + // Warmup + ggml_graph_compute(gf, &cplan); + + auto t0 = std::chrono::high_resolution_clock::now(); + + for (int i=0; i < n_rounds; i++) { + ggml_graph_compute(gf, &cplan); + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + auto usec = std::chrono::duration_cast(t1-t0).count(); + auto nsec = std::chrono::duration_cast(t1-t0).count(); + std::cerr << "graph-compute took " << usec << " usec " + << "\n " << (float) usec / n_rounds << " usec per-iter" + << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node" + << "\n"; + + ggml_threadpool_free(threadpool); + ggml_free(ctx); +} + +static void test_active(int n_threads, int n_rounds) { + struct ggml_init_params params = { + /* .mem_size = */ 
1024*1024*1024,
+ /* .mem_buffer = */ NULL,
+ /* .no_alloc = */ false,
+ };
+
+ struct ggml_context * ctx = ggml_init(params);
+
+ // Create graph
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+ // Small graph with parallel ops and barriers in between
+ struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+ for (int i = 0; i < 2; i++) {
+ struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+ out = ggml_mul_mat(ctx, a, out);
+
+ struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+ out = ggml_mul_mat(ctx, d, out);
+ }
+
+ ggml_build_forward_expand(gf, out);
+ int n_nodes = ggml_graph_n_nodes(gf);
+
+ // Create threadpool
+ struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+ struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+ exit(1);
+ }
+
+ std::cerr << "graph-compute with"
+ << "\n n_threads: " << n_threads
+ << "\n n_nodes: " << n_nodes
+ << "\n n_rounds: " << n_rounds
+ << "\n";
+ // ggml_graph_print(gf);
+
+ // In this test we keep changing the number of threads every 4th iteration
+ // to test for race conditions in that path
+
+ for (int i=0; i < n_rounds; i++) {
+ struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
+
+ std::vector<uint8_t> work_data(cplan.work_size);
+ cplan.work_data = work_data.data();
+
+ ggml_graph_compute(gf, &cplan);
+ }
+
+ ggml_threadpool_free(threadpool);
+ ggml_free(ctx);
+}
+
+static void test_multi_graph(int n_threads, int n_rounds) {
+ struct ggml_init_params params = {
+ /* .mem_size = */ 1024*1024*1024,
+ /* .mem_buffer = */ NULL,
+ /* .no_alloc = */ false,
+ };
+
+ struct ggml_context * ctx = ggml_init(params);
+
+ // Create graphs
+ struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
+ {
+ // Small graph with parallel ops and barriers in between
+ struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+ for (int i = 0; i < 2; i++) {
+ struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+ out = ggml_mul_mat(ctx, a, out);
+
+ struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+ out = ggml_mul_mat(ctx, d, out);
+ }
+
+ ggml_build_forward_expand(gf0, out);
+ }
+
+ struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
+ {
+ // Small graph with parallel ops and barriers in between
+ // Use larger tensors to make sure work_data size is larger than gf0
+ struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
+ for (int i = 0; i < 4; i++) {
+ struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
+ out = ggml_mul_mat(ctx, a, out);
+
+ struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
+ out = ggml_mul_mat(ctx, d, out);
+ }
+
+ ggml_build_forward_expand(gf1, out);
+ }
+
+
+ // Create threadpool
+ struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+ struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+ exit(1);
+ }
+
+ std::cerr << "graph-compute with"
+ << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
+ << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
+ << "\n n_threads: " << n_threads
+ << "\n n_rounds: " << n_rounds
+ << "\n";
+
+ // In this test we keep changing the number of threads every 4th iteration
+ // and we compute two graphs back to back to test frequent graph switching
+
+ for (int i=0; i < 
n_rounds; i++) {
+ struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
+ std::vector<uint8_t> work_data0(cplan0.work_size);
+ cplan0.work_data = work_data0.data();
+
+ struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
+ std::vector<uint8_t> work_data1(cplan1.work_size);
+ cplan1.work_data = work_data1.data();
+
+ ggml_graph_compute(gf0, &cplan0);
+ ggml_graph_compute(gf1, &cplan1);
+ }
+
+ ggml_threadpool_free(threadpool);
+ ggml_free(ctx);
+}
+
+
+int main(int argc, char *argv[]) {
+
+ int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
+ int n_rounds = 100;
+
+ if (argc > 1) {
+ n_threads = std::atoi(argv[1]);
+ }
+
+ if (argc > 2) {
+ n_rounds = std::atoi(argv[2]);
+ }
+
+ test_barrier(n_threads, n_rounds);
+
+ test_active(n_threads, n_rounds * 100);
+
+ test_multi_graph(n_threads, n_rounds * 10);
+
+ return 0;
+}
diff --git a/llama.cpp/tests/test-c.c b/llama.cpp/tests/test-c.c new file mode 100644 index 0000000..a050710 --- /dev/null +++ b/llama.cpp/tests/test-c.c @@ -0,0 +1,3 @@
+#include "llama.h"
+
+int main(void) {}
diff --git a/llama.cpp/tests/test-chat-parser.cpp b/llama.cpp/tests/test-chat-parser.cpp new file mode 100644 index 0000000..6f44a2b --- /dev/null +++ b/llama.cpp/tests/test-chat-parser.cpp @@ -0,0 +1,617 @@
+// Tests chat handling, including grammar generation and parsing for tool calling, for various templates.
+//
+// Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates,
+// e.g. given Minja (http://github.com/google/minja) checked out in parent dir:
+//
+// cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+//
+#include
+#include
+#include
+
+#include "chat-parser.h"
+#include "common.h"
+#include "log.h"
+#include "regex-partial.h"
+
+template <class T>
+static void assert_equals(const std::string_view label, const T & expected, const T & actual) {
+ if (expected != actual) {
+ std::cerr << label << std::endl;
+ std::cerr << "Expected: " << expected << std::endl;
+ std::cerr << "Actual: " << actual << std::endl;
+ std::cerr << std::flush;
+ throw std::runtime_error("Test failed");
+ }
+}
+
+template <class T>
+static void assert_equals(const T & expected, const T & actual) {
+ assert_equals("", expected, actual);
+}
+static void assert_equals(const char * expected, const std::string & actual) {
+ return assert_equals<std::string>(expected, actual);
+}
+
+static void assert_throws(const std::function<void()> & fn, const std::string & expected_exception_pattern = "") {
+ try {
+ fn();
+ } catch (const std::exception & e) {
+ if (expected_exception_pattern.empty()) {
+ return;
+ }
+ std::regex expected_exception_regex(expected_exception_pattern);
+ std::string actual_message = e.what();
+ if (std::regex_search(actual_message, expected_exception_regex)) {
+ return;
+ }
+ throw std::runtime_error("Exception doesn't match expected pattern: " + actual_message + " (pattern: " + expected_exception_pattern + ")");
+ }
+ throw std::runtime_error("Exception was expected but not thrown");
+}
+
+static void test_reasoning() {
+ //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
+ {
+ common_chat_parser_params params;
+ params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+ params.reasoning_in_content = false;
+ params.thinking_forced_open = false;
+ common_chat_msg_parser builder("<think>Cogito</think>Ergo 
sum", /* is_partial= */ false, params); + assert_equals(false, builder.try_parse_reasoning("", "")); + assert_equals("CogitoErgo sum", builder.consume_rest()); + } + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals(std::string("Cogito"), builder.result().reasoning_content); + assert_equals("Ergo sum", builder.consume_rest()); + } + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_NONE; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); + assert_equals(false, builder.try_parse_reasoning("", "")); + assert_equals("CogitoErgo sum", builder.consume_rest()); + } + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals(std::string("Cogito"), builder.result().reasoning_content); + assert_equals("Ergo sum", builder.consume_rest()); + } + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = true; + params.thinking_forced_open = true; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals("Cogito", builder.result().content); + assert_equals("Ergo sum", builder.consume_rest()); + } + { + const std::string variant("content_only_inline_think"); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = false; + const std::string input = "PenseBonjour"; + auto msg = common_chat_parse(input, false, params); + assert_equals(variant, std::string("Pense"), msg.reasoning_content); + assert_equals(variant, std::string("Bonjour"), msg.content); + } + { + const std::string variant("llama_3_inline_think"); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_LLAMA_3_X; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = false; + const std::string input = "PlanRéponse"; + auto msg = common_chat_parse(input, false, params); + assert_equals(variant, std::string("Plan"), msg.reasoning_content); + assert_equals(variant, std::string("Réponse"), msg.content); + } + // Test DeepSeek V3.1 parsing - reasoning content followed by "" and then regular content + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + 
params.parse_tool_calls = true;
+ const std::string variant("deepseek_v3_1_reasoning_format_deepseek");
+ common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, params);
+ assert_equals(variant, true, builder.try_parse_reasoning("<think>", "</think>"));
+ assert_equals(variant, std::string("REASONING"), builder.result().reasoning_content);
+ assert_equals(variant, std::string("ok"), builder.consume_rest());
+ }
+ // Test DeepSeek V3.1 parsing - reasoning_format none - reasoning content followed by "</think>" and then regular content
+ {
+ common_chat_parser_params params;
+ params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+ params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+ params.reasoning_in_content = false;
+ params.thinking_forced_open = true;
+ params.parse_tool_calls = true;
+ const std::string variant("deepseek_v3_1_reasoning_format_none");
+ const std::string input = "REASONING</think>ok";
+ auto msg = common_chat_parse(input, false, params);
+ assert_equals(variant, std::string("REASONING</think>ok"), msg.content);
+ assert_equals(variant, std::string(""), msg.reasoning_content);
+ }
+}
+
+static void test_regex() {
+ auto test_throws = [](const std::string & input, const std::string & regex, const std::string & expected_exception_pattern = "") {
+ common_chat_msg_parser builder(input, /* is_partial= */ false, {});
+ assert_throws([&]() { builder.consume_regex(common_regex(regex)); }, expected_exception_pattern);
+ };
+
+ test_throws("Hello, world!", "abc", "^abc$");
+ test_throws("Hello, world!", "e", "^e$");
+
+ {
+ common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {});
+ builder.consume_regex(common_regex("Hello"));
+ assert_equals(", world!", builder.consume_rest());
+ }
+
+ {
+ // In non-partial mode, we can say definitively whether the regex was consumed or not.
+ common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
+ assert_equals(false, builder.try_consume_regex(common_regex("Hello, world!")).has_value());
+ }
+ {
+ common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
+ auto res = builder.try_consume_regex(common_regex("H(el)l(?:o, world!)?"));
+ assert_equals(true, res.has_value());
+ // Verify captures
+ assert_equals<size_t>(2, res->groups.size());
+ assert_equals("Hell", builder.str(res->groups[0]));
+ assert_equals("el", builder.str(res->groups[1]));
+ // Verify position is after the match
+ assert_equals<size_t>(4, builder.pos());
+ assert_equals("o,", builder.consume_rest());
+ }
+ {
+ // But in partial mode we have a partial final match and can't decide, so we throw a partial exception.
+ common_chat_msg_parser builder("Hello,", /* is_partial= */ true, {});
+ assert_throws([&]() {
+ builder.try_consume_regex(common_regex("Hello, world!"));
+ }, "^Hello, world!$");
+ }
+
+ // Now regardless of the mode, we can tell these aren't a match.
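+ // (These patterns share no possible prefix with the remaining input, e.g.
+ // "a(b|c)(d|e)f" cannot match anything that starts with "Hello,", so unlike
+ // the "Hello, world!" pattern above, which is a viable prefix and therefore
+ // throws in partial mode, both modes can reject outright.)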
+ for (const auto is_partial : {false, true}) { + common_chat_msg_parser builder("Hello,", is_partial, {}); + assert_equals(false, builder.try_consume_regex(common_regex("a(b|c)(d|e)f")).has_value()); + } + for (const auto is_partial : {false, true}) { + common_chat_msg_parser builder("Hello,", is_partial, {}); + assert_equals(false, builder.try_consume_literal("Oh")); + } +} + +const std::vector barely_healable_jsons = { + "{", + "{\"", + "{\"\\", + "{\"n", + "{\"name\"", + "{\"name\":", + "{\"name\":\"", + "{\"name\":\"\\", + "{\"name\":\"python", + "{\"name\":\"python\\", + "{\",", + "{\":", + "{\"[", + "{\"]", + "{\"{", + "{\"}", + "{\"1", + "{\"name\":\",", + "{\"name\":\":", + "{\"name\":\"[", + "{\"name\":\"]", + "{\"name\":\"{", + "{\"name\":\"}", + "{\"name\":\"1", +}; + +static void test(const std::string & input, bool is_partial, const std::vector> & args_paths, const std::vector> & content_paths, const std::string & expected) { + common_chat_msg_parser builder(input, is_partial, {}); + auto js = builder.try_consume_json_with_dumped_args(args_paths, content_paths); + assert_equals(true, js.has_value()); + assert_equals(is_partial, js->is_partial); + assert_equals(expected, args_paths.size() == 1 && args_paths[0].empty() ? js->value.get() : js->value.dump()); +} + +static void test_deepseek_v3_1_tool_calls() { + //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG); + // variant: happy path for when it works as the model card says it should + const std::string variant("simple"); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = true; + const std::string input = "<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto msg = common_chat_parse(input, false, params); + assert_equals(variant, 1, msg.tool_calls.size()); + assert_equals(variant, std::string("get_time"), msg.tool_calls[0].name); + // JSON arguments are dumped without spaces + assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), msg.tool_calls[0].arguments); + assert_equals(variant, std::string(""), msg.content); + assert_equals(variant, std::string(""), msg.reasoning_content); + + // variant: simple + thinking open + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; + const std::string variant("simple_thinking"); + const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, 1, m.tool_calls.size()); + assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); + assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments); + assert_equals(variant, std::string(""), m.content); + assert_equals(variant, std::string("REASONING"), m.reasoning_content); + } + // variant: simple + multiple tool calls + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + 
params.parse_tool_calls = true; + const std::string variant("simple_multiple_tool_calls"); + const std::string in = "CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, 2, m.tool_calls.size()); + assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); + assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[0].arguments); + assert_equals(variant, std::string("get_weather"), m.tool_calls[1].name); + assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[1].arguments); + assert_equals(variant, std::string("CONTENT"), m.content); + assert_equals(variant, std::string(""), m.reasoning_content); + } + + + // variant: thinking forced open + tool call in reasoning content + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; + const std::string variant("thinking_forced_open_tool_call_in_reasoning"); + const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, 1, m.tool_calls.size()); + assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); + assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments); + assert_equals(variant, std::string(""), m.content); + assert_equals(variant, std::string("REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING"), m.reasoning_content); + } + + // variant: thinking forced open + tool call in reasoning content + no closing think + not partial + // This is a bit of a fine tuning issue on the model's part IMO. It really should not be attempting + // to make tool calls in reasoning content according to the model card, but it does sometimes, so + // add the reasoning content as regular content and parse the tool calls. 
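+ // (In other words: on a complete, non-partial stream that never closed the
+ // think block, the fallback below moves the would-be reasoning into regular
+ // content and still extracts the tool calls, whereas the partial variant
+ // further down keeps everything in reasoning_content until more input arrives.)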
+ { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; + const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_not_partial"); + const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, std::string("REASONING"), m.content); + assert_equals(variant, std::string(""), m.reasoning_content); + assert_equals(variant, 1, m.tool_calls.size()); + assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); + assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments); + } + + // variant: thinking forced open + tool call in reasoning content + no closing think + partial + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; + const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_partial"); + const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; + auto m = common_chat_parse(in, /* is_partial= */ true, params); + assert_equals(variant, std::string("REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"), m.reasoning_content); + assert_equals(variant, std::string(""), m.content); + assert_equals(variant, 0, m.tool_calls.size()); + } + + // variant: thinking not forced open + reasoning + regular content + no tool calls + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; + const std::string variant("thinking_forced_open_reasoning_regular_content_no_tool_calls"); + const std::string in = "REASONINGCONTENT"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, 0, m.tool_calls.size()); + assert_equals(variant, std::string("CONTENT"), m.content); + assert_equals(variant, std::string("REASONING"), m.reasoning_content); + } + // variant: thinking not forced open + missing reasoning + no tool calls + { + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = true; + const std::string variant("thinking_not_forced_open_missing_reasoning_no_tool_calls"); + const std::string in = "CONTENT"; + auto m = common_chat_parse(in, false, params); + assert_equals(variant, 0, m.tool_calls.size()); + assert_equals(variant, std::string("CONTENT"), m.content); + assert_equals(variant, std::string(""), m.reasoning_content); + } +} + +static void test_with_args(const std::string & input, const std::string & expected, bool parse_as_partial = true, bool is_partial = true) { + common_chat_msg_parser builder(input, parse_as_partial, {}); + auto js = 
builder.try_consume_json_with_dumped_args({{"args"}}, {}); + assert_equals(true, js.has_value()); + assert_equals(is_partial, js->is_partial); + assert_equals(expected, js->value.dump()); +} + +static void test_json_with_dumped_args_no_args() { + // Normal JSON, nothing to heal, nothing to dump + test("{\"name\": \"python\"}", false, {}, {}, "{\"name\":\"python\"}"); + // Full json is args + test("{\"name\": \"python\"}", false, {{}}, {}, "{\"name\":\"python\"}"); + + // If the arguments are further down, don't heal partial content. + for (const auto & src : barely_healable_jsons) { + test(src, true, {{"arguments"}}, {}, "{}"); + } + // But heal content that isn't partial. + test("{\"name\": \"python\"", true, {{"arguments"}}, {}, "{\"name\":\"python\"}"); +} + +static void test_json_with_dumped_args() { + + // Partial content. + test("{\"content\": \"t", true, {}, {{"content"}}, "{\"content\":\"t\"}"); + test("{\"content\": \"", true, {}, {{"content"}}, "{\"content\":\"\"}"); + test("{\"content\": ", true, {}, {{"content"}}, "{}"); + + // If the entire JSON is the arguments, healing it them dumping it produces the same output as the input (just reformatted). + test("{\"name\": \"python", true, {{}}, {}, "{\"name\":\"python"); + for (const auto & src : barely_healable_jsons) { + test(src, true, {{}}, {}, src); + } + + // Full JSON w/ args + for (auto parse_as_partial : {true, false}) { + test_with_args( + R"({"name": "python", "args": {"arg1": 1}})", + R"({"name":"python","args":"{\"arg1\":1}"})", + parse_as_partial, + /* is_partial= */ false + ); + } + + // Partial JSON w/ partial args + test_with_args( + R"({"foo": "bar", "args": {")", + R"({"foo":"bar","args":"{\""})" + ); + // Partial args broken in object key + test_with_args( + R"({"foo": "bar", "args": {"ar)", + R"({"foo":"bar","args":"{\"ar"})" + ); + // Partial args broken after object key + test_with_args( + R"({"foo": "bar", "args": {"arg1")", + R"({"foo":"bar","args":"{\"arg1\""})" + ); + // Partial args broken before object value + test_with_args( + R"({"foo": "bar", "args": {"arg1":)", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken before object value (space) + test_with_args( + R"({"foo": "bar", "args": {"arg1": )", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken in object value that may not be complete (int) + test_with_args( + R"({"foo": "bar", "args": {"arg1": 1)", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken in object value that is complete (int) + test_with_args( + R"({"foo": "bar", "args": {"arg1": 1 )", + R"({"foo":"bar","args":"{\"arg1\":1"})" + ); + // Partial args broken in object value that is incomplete (string) + test_with_args( + R"({"foo": "bar", "args": {"arg1": ")", + R"({"foo":"bar","args":"{\"arg1\":\""})" + ); + // Partial args broken in object value that is complete (string) + test_with_args( + R"({"foo": "bar", "args": {"arg1": "1")", + R"({"foo":"bar","args":"{\"arg1\":\"1\""})" + ); + // Partial args broken on array opening + test_with_args( + R"({"foo": "bar", "args": [)", + R"({"foo":"bar","args":"["})" + ); + // Partial args broken on array value that is incomplete (int) + test_with_args( + R"({"foo": "bar", "args": [1)", + R"({"foo":"bar","args":"["})" + ); + // Partial args broken on array value that is complete (int) + test_with_args( + R"({"foo": "bar", "args": [1 )", + R"({"foo":"bar","args":"[1"})" + ); + // Partial args broken on array value that is complete (string) + test_with_args( + R"({"foo": "bar", "args": ["1")", 
+ R"({"foo":"bar","args":"[\"1\""})" + ); + // Partial args broken after array value + test_with_args( + R"({"foo": "bar", "args": [1,)", + R"({"foo":"bar","args":"[1,"})" + ); + // Partial args broken on nested array + test_with_args( + R"({"foo": "bar", "args": {"arg1": [)", + R"({"foo":"bar","args":"{\"arg1\":["})" + ); + + // Unicode tests + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\u)", + R"({"foo":"bar","args":"{\"arg1\":\"\\u"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\u0)", + R"({"foo":"bar","args":"{\"arg1\":\"\\u0"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\u00)", + R"({"foo":"bar","args":"{\"arg1\":\"\\u00"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\u000)", + R"({"foo":"bar","args":"{\"arg1\":\"\\u000"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\u0000)", + R"({"foo":"bar","args":"{\"arg1\":\"\\u0000"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud8)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud8"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud80)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud80"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\u)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\u"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\ud)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\ud"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\udc)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\udc"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\udc0)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\udc0"})" + ); + test_with_args( + R"({"foo": "bar", "args": {"arg1": "\ud800\udc00)", + R"({"foo":"bar","args":"{\"arg1\":\"\\ud800\\udc00"})" + ); +} + +static void test_positions() { + { + common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {}); + assert_equals(0, builder.pos()); + assert_throws([&]() { builder.move_to(100); }); + assert_equals(0, builder.pos()); + assert_throws([&]() { builder.move_back(1); }); + assert_equals(0, builder.pos()); + + builder.move_to(8); + assert_equals(8, builder.pos()); + builder.move_back(1); + assert_equals(7, builder.pos()); + assert_equals("world!", builder.consume_rest()); + + builder.move_to(0); + assert_equals(0, builder.pos()); + + assert_throws([&]() { builder.finish(); }); + assert_equals(0, builder.pos()); + + builder.move_to(builder.input().size()); + builder.finish(); + } + { + common_chat_msg_parser builder("Hello, world!", /* is_partial= */ true, {}); + + builder.move_to(builder.input().size()); + assert_equals(builder.input().size(), builder.pos()); + builder.finish(); + } +} + +int main() { + test_positions(); + test_json_with_dumped_args_no_args(); + test_json_with_dumped_args(); + test_reasoning(); + test_regex(); + test_deepseek_v3_1_tool_calls(); + std::cout << "All tests passed!\n"; + return 0; +} diff --git a/llama.cpp/tests/test-chat-peg-parser.cpp b/llama.cpp/tests/test-chat-peg-parser.cpp new file mode 100644 index 0000000..f767c73 --- /dev/null +++ b/llama.cpp/tests/test-chat-peg-parser.cpp @@ -0,0 +1,768 @@ +#include +#include +#include + +#include "chat-parser.h" +#include "chat-peg-parser.h" +#include "chat.h" +#include 
"common.h" +#include "json-schema-to-grammar.h" +#include "peg-parser.h" +#include "testing.h" +#include "peg-parser/simple-tokenize.h" +#include "nlohmann/json.hpp" + +using json = nlohmann::ordered_json; + +static json create_tools(); +static void test_example_native(testing & t); +static void test_example_qwen3_coder(testing & t); +static void test_command7_parser_compare(testing & t); + +int main(int argc, char *argv[]) { + testing t(std::cout); + if (argc >= 2) { + t.set_filter(argv[1]); + } + + const char * verbose = getenv("LLAMA_TEST_VERBOSE"); + if (verbose) { + t.verbose = std::string(verbose) == "1"; + } + + t.test("native", test_example_native); + t.test("qwen3 coder", test_example_qwen3_coder); + t.test("comparison", test_command7_parser_compare); + + return t.summary(); +} + +static json create_tools() { + json tools = json::array(); + + json tool_weather = { + {"type", "function"}, + {"function", { + {"name", "get_current_weather"}, + {"description", "Get the current weather in a given location"}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"location", { + {"type", "string"}, + {"description", "The city and state, e.g. San Francisco, CA"} + }}, + {"unit", { + {"type", "string"}, + {"enum", {"celsius", "fahrenheit"}}, + {"description", "The temperature unit to use. Infer this from the users location."} + }} + }}, + {"required", {"location", "unit"}}, + }}, + }} + }; + tools.push_back(tool_weather); + + json tool_forecast = { + {"type", "function"}, + {"function", { + {"name", "get_forecast"}, + {"description", "Get the weather forecast for a given location"}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"location", { + {"type", "string"}, + {"description", "The city and state, e.g. San Francisco, CA"} + }}, + {"unit", { + {"type", "string"}, + {"enum", {"celsius", "fahrenheit"}}, + {"description", "The temperature unit to use. Infer this from the users location."} + }}, + {"days", { + {"type", "integer"}, + {"description", "Number of days to forecast (1-10)"}, + {"minimum", 1}, + {"maximum", 10} + }} + }}, + {"required", {"location", "unit"}}, + }}, + }} + }; + tools.push_back(tool_forecast); + + json tool_search = { + {"type", "function"}, + {"function", { + {"name", "search_knowledge_base"}, + {"description", "Search the internal technical documentation knowledge base."}, + {"parameters", { + {"type", "object"}, + {"properties", { + {"query", { + {"type", "string"}, + {"description", "The search query string."} + }}, + {"max_results", { + {"type", "integer"}, + {"description", "The maximum number of results to return."}, + {"default", 5} + }}, + {"category", { + {"type", "string"}, + {"enum", {"api", "troubleshooting", "billing", "general"}}, + {"description", "Filter search by specific category."} + }} + }}, + {"required", {"query", "category"}}, + {"additionalProperties", false} + }}, + {"strict", true} + }} + }; + tools.push_back(tool_search); + + return tools; +} + +struct tool_argument { + std::string name; + std::string type; + bool is_required; + json schema; +}; + +struct tool_definition { + std::string name; + std::vector arguments; + json schema; +}; + +// Test fictitious model output that emits arguments as JSON. 
+static void test_example_native(testing & t) { + struct test_case { + // Parameters + std::string name; + json tools; + common_chat_tool_choice tool_choice; + common_reasoning_format reasoning_format; + json json_schema; + bool parallel_tool_calls; + bool thinking_forced_open; + std::string input; + + // Expect + std::string expect_reasoning; + std::string expect_content; + std::vector expect_tool_calls; + }; + + auto build_parser = [](const test_case & tc) { + return build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) { + auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE); + auto reasoning = p.eps(); + if (tc.thinking_forced_open) { + // If thinking is forced open, expect a closing tag + reasoning = p.reasoning(p.until("")) + "" + p.space(); + } else { + // Otherwise, optionally accept thinking wrapped in tags + reasoning = p.optional("" + p.reasoning(p.until("")) + "" + p.space()); + } + + // tool calling parser + if (tc.tools.is_array() && !tc.tools.empty()) { + auto tools = p.choice(); + for (const auto & tool : tc.tools) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + const auto & schema = function.at("parameters"); + + auto tool_name = p.json_member("name", "\"" + p.tool_name(p.literal(name)) + "\""); + auto tool_args = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))); + + tools |= p.rule("tool-" + name, p.tool_open(p.literal("{")) << tool_name << "," << tool_args << "}"); + }; + + auto parallel_calls = p.eps(); + if (tc.parallel_tool_calls) { + parallel_calls = p.zero_or_more("," << tools); + } + + auto tool_call = p.trigger_rule("tool-call", + p.sequence({ + p.literal("["), + tools, + parallel_calls, + p.literal("]") + }) + ); + + return p.sequence({ + (reasoning_in_content ? p.eps() : reasoning), + p.content(p.until("")), + p.optional(p.space() + tool_call), + p.space(), + p.end() + }); + } + + // response_format parser + if (tc.json_schema.is_object() && !tc.json_schema.empty()) { + return p.sequence({ + (reasoning_in_content ? p.eps() : reasoning), + p.content(p.schema(p.json(), "response-output", tc.json_schema)), + p.space(), + p.end() + }); + } + + // Content-only parser + return p.sequence({ + (reasoning_in_content ? 
p.eps() : reasoning), + p.content(p.rest()), + p.end() + }); + }); + }; + + std::vector test_cases = std::vector{ + { + /* .name = */ "content with thinking_forced_open = false", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ false, + /* .input = */ ( + "The user said hello, I must say hello back\nHello" + ), + /* .expect_reasoning = */ "The user said hello, I must say hello back", + /* .expect_content = */ "Hello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "content with thinking_forced_open = false and no reasoning", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ false, + /* .input = */ ( + "Hello" + ), + /* .expect_reasoning = */ "", + /* .expect_content = */ "Hello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "content with thinking_forced_open = false and reasoning_format = none", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "The user said hello, I must say hello back\nHello" + ), + /* .expect_reasoning = */ "", + /* .expect_content = */ "The user said hello, I must say hello back\nHello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "content with thinking_forced_open = true", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "The user said hello, I must say hello back\nHello" + ), + /* .expect_reasoning = */ "The user said hello, I must say hello back", + /* .expect_content = */ "Hello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "content with thinking_forced_open = true and reasoning_format = none", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "The user said hello, I must say hello back\nHello" + ), + /* .expect_reasoning = */ "", + /* .expect_content = */ "The user said hello, I must say hello back\nHello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "tools with tool_choice = auto and no parallel_tool_calls", + /* .tools = */ create_tools(), + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "I must get the weather in New York\n" + "[" + R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})" + "]" + ), + /* .expect_reasoning = */ "I must get the weather in New York", + /* .expect_content = */ "", + /* .expect_tool_calls = */ {{ + /* .name = */ "get_current_weather", + /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})", + /* .id = */ "", + }}, + }, + { + /* .name = */ "tools with 
tool_choice = auto and parallel_tool_calls", + /* .tools = */ create_tools(), + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ true, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "I must get the weather in New York and San Francisco and a 3 day forecast of each.\nLet me search that for you." + "[" + R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})" + ", " + R"({"name": "get_current_weather", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit"}})" + ", " + R"({"name": "get_forecast", "arguments": {"location": "New York City, NY", "unit": "fahrenheit", "days": 3}})" + ", " + R"({"name": "get_forecast", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3}})" + "]" + ), + /* .expect_reasoning = */ "I must get the weather in New York and San Francisco and a 3 day forecast of each.", + /* .expect_content = */ "Let me search that for you.", + /* .expect_tool_calls = */ {{ + /* .name = */ "get_current_weather", + /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})", + /* .id = */ "", + }, { + /* .name = */ "get_current_weather", + /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit"})", + /* .id = */ "", + }, { + /* .name = */ "get_forecast", + /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit", "days": 3})", + /* .id = */ "", + }, { + /* .name = */ "get_forecast", + /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3})", + /* .id = */ "", + }}, + }, + { + /* .name = */ "response_format with thinking_forced_open = true", + /* .tools = */ {}, + /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ { + {"type", "object"}, + {"properties", { + {"invoice_number", {{"type", "string"}}}, + {"amount", {{"type", "number"}}}, + {"due_date", {{"type", "string"}}} + }}, + {"required", {"invoice_number", "amount", "due_date"}} + }, + /* .parallel_tool_calls = */ false, + /* .thinking_forced_open = */ true, + /* .input = */ ( + "I must produce the invoice in the requested format\n" + R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})" + ), + /* .expect_reasoning = */ "I must produce the invoice in the requested format", + /* .expect_content = */ R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})", + /* .expect_tool_calls = */ {}, + }, + }; + + for (const auto & tc : test_cases) { + t.test(tc.name, [&](testing & t) { + auto parser = build_parser(tc); + auto lazy = !tc.tools.empty() && tc.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + auto grammar = build_grammar([&](const common_grammar_builder & builder) { + for (auto const & def : tc.tools) { + auto function = def.at("function"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + }; + parser.build_grammar(builder, lazy); + }); + + t.log("Grammar:"); + for (auto const & line : string_split(grammar, "\n")) { + t.log(line); + } + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + t.assert_true("success", result.success()); + + common_chat_msg msg; + auto mapper = common_chat_peg_native_mapper(msg); + mapper.from_ast(ctx.ast, result); + + t.assert_equal("content equal", tc.expect_content, msg.content); + 
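+            // Compare the mapped message against the test case: reasoning, content, and tool calls should all round-trip through the AST.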
t.assert_equal("reasoning equal", tc.expect_reasoning, msg.reasoning_content); + t.assert_equal("number of tool calls", tc.expect_tool_calls.size(), msg.tool_calls.size()); + for (auto i = 0u; i < std::min(tc.expect_tool_calls.size(), msg.tool_calls.size()); i++) { + t.assert_equal("tool name", tc.expect_tool_calls[i].name, msg.tool_calls[i].name); + t.assert_equal("tool args", tc.expect_tool_calls[i].arguments, msg.tool_calls[i].arguments); + } + }); + } +} + +static void test_example_qwen3_coder(testing & t) { + auto tools = create_tools(); + auto parser = build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) { + auto content = p.rule("content", p.content(p.until(""))); + + std::vector tool_parsers; + for (auto const & def : tools) { + auto function = def.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + auto properties = parameters.at("properties"); + + std::set required_properties; + if (function.contains("required")) { + function.at("required").get_to(required_properties); + } + + std::vector arg_parsers; + for (const auto & [param_name, param_schema] : properties.items()) { + bool is_required = required_properties.find(param_name) != required_properties.end(); + auto type = param_schema.value("type", "object"); + + auto arg = p.tool_arg(p.sequence({ + p.tool_arg_open(""), + (type == "string" ? + p.tool_arg_string_value( + p.schema( + p.until_one_of({ + "\n\n" + }), + "tool-" + name + "-arg-" + param_name + "-schema", + param_schema, + true + ) + ) : p.tool_arg_json_value( + p.schema( + p.json(), + "tool-" + name + "-arg-" + param_name + "-schema", + param_schema + ) + ) + ), + p.tool_arg_close( + "\n" + + p.peek(p.literal("")) + ) + })); + + arg_parsers.push_back(is_required ? + p.rule("tool-" + name + "-arg-" + param_name, arg) : + p.optional(p.rule("tool-" + name + "-arg-" + param_name, arg))); + } + + tool_parsers.push_back(p.rule("tool-" + name, + p.tool_open("") + << p.sequence(arg_parsers) + << p.tool_close(p.literal("")) + )); + }; + + auto tool_call = p.trigger_rule("tool-call", + "" + << p.choice(tool_parsers) + << "" + ); + + return content + p.zero_or_more(p.space() + tool_call) + p.end(); + }); + + auto grammar = build_grammar([&](const common_grammar_builder & builder) { + for (auto const & def : tools) { + auto function = def.at("function"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + }; + parser.build_grammar(builder); + }); + + t.log("Grammar:"); + for (auto const & line : string_split(grammar, "\n")) { + t.log(line); + } + + t.test("incremental parsing", [&](testing &t) { + std::string input = + "Let me search the knowledge base for cat pictures." 
+ "\n" + "\n" + "cat pictures\n" + "general\n" + "\n" + ""; + + std::vector tokens = simple_tokenize(input); + + common_chat_msg prev; + for (auto it = tokens.begin(); it != tokens.end(); it++) { + std::string in = std::accumulate(tokens.begin(), it + 1, std::string()); + + common_peg_parse_context ctx(in, it + 1 < tokens.end()); + + auto result = parser.parse(ctx); + if (!t.assert_equal("not fail", false, result.fail())) { + t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end)); + } + + common_chat_msg msg; + auto mapper = common_chat_peg_constructed_mapper(msg); + mapper.from_ast(ctx.ast, result); + + //t.log("Input: " + input); + t.log("==========================================="); + t.log("Iteration " + std::to_string(in.size())); + t.log("Reasoning: " + msg.reasoning_content); + t.log("Content : " + msg.content); + for (const auto & tc : msg.tool_calls) { + t.log("Tool name: " + tc.name); + t.log("Tool args: " + tc.arguments); + } + + try { + // This shouldn't emit any runtime errors + auto diffs = common_chat_msg_diff::compute_diffs(prev, msg); + } catch(const std::exception & e) { + t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end)); + t.assert_true(std::string("failed with ") + e.what(), false); + } + + prev = msg; + } + }); +} + +void test_command7_parser_compare(testing & t) { + auto parser = build_chat_peg_native_parser([](common_chat_peg_native_builder & p) { + auto thinking = p.reasoning_block( + "<|START_THINKING|>" << p.reasoning(p.until("<|END_THINKING|>")) << "<|END_THINKING|>"); + + auto response = "<|START_RESPONSE|>" << p.content(p.until("<|END_RESPONSE|>")) << "<|END_RESPONSE|>"; + + auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.json_string_content()) + "\""))); + auto tool_call_name = p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.json_string_content()) + "\""))); + auto tool_call_args = "\"parameters\"" << (":" << p.tool_args(p.json())); + + auto tool_call_fields = p.rule("tool-call-fields", tool_call_id | tool_call_name | tool_call_args); + auto tool_call = p.rule("tool-call", p.tool( + p.tool_open(p.literal("{")) + << tool_call_fields + << p.zero_or_more( p.literal(",") << tool_call_fields) + << p.tool_close(p.literal("}")) + )); + + auto tool_calls = p.rule("tool-calls", + "<|START_ACTION|>" + << ("[" << tool_call << p.zero_or_more(p.literal(",") << tool_call) << "]") + << "<|END_ACTION|>"); + + return p.optional(thinking) << (tool_calls | response) + p.end(); + }); + + auto test_current = [&](const common_peg_arena & p, const std::string & input, bool is_partial, bool print_results) { + common_peg_parse_context ctx(input, is_partial); + auto result = p.parse(ctx); + + common_chat_msg msg; + auto mapper = common_chat_peg_native_mapper(msg); + mapper.from_ast(ctx.ast, result); + + if (print_results) { + std::cout << "== Parsed (new) ==\n"; + std::cout << "=== Reasoning ===\n"; + std::cout << msg.reasoning_content << "\n"; + std::cout << "\n\n=== Content ===\n"; + std::cout << msg.content << "\n"; + std::cout << "\n\n=== Tool Calls ===\n"; + for (const auto & tc : msg.tool_calls) { + std::cout << "id: " << tc.id << "\n"; + std::cout << "name: " << tc.name << "\n"; + std::cout << "args: " << tc.arguments << "\n"; + } + } + }; + + auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) { + // Original common_chat_combinator_parser taken from chat.cpp + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_GENERIC; + 
params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder( + input, + /* .is_partial = */ need_more_input, + params + ); + + builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>"); + + static const common_regex start_action_regex("<\\|START_ACTION\\|>"); + static const common_regex end_action_regex("<\\|END_ACTION\\|>"); + static const common_regex start_response_regex("<\\|START_RESPONSE\\|>"); + static const common_regex end_response_regex("<\\|END_RESPONSE\\|>"); + + if (auto res = builder.try_find_regex(start_action_regex)) { + // If we didn't extract thoughts, prelude includes them. + auto tool_calls = builder.consume_json_with_dumped_args({ { "parameters" } }); + for (const auto & tool_call : tool_calls.value) { + std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : ""; + std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : ""; + std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : ""; + if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + } + if (tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_regex(end_action_regex); + } else if (auto res = builder.try_find_regex(start_response_regex)) { + if (!builder.try_find_regex(end_response_regex)) { + builder.add_content(builder.consume_rest()); + throw common_chat_msg_partial_exception(end_response_regex.str()); + } + } else { + builder.add_content(builder.consume_rest()); + } + + if (print_results) { + std::cout << "== Parsed (legacy) ==\n"; + std::cout << "=== Reasoning ===\n"; + std::cout << builder.result().reasoning_content << "\n"; + std::cout << "\n\n=== Content ===\n"; + std::cout << builder.result().content << "\n"; + std::cout << "\n\n=== Tool Calls ===\n"; + for (const auto & tc : builder.result().tool_calls) { + std::cout << "id: " << tc.id << "\n"; + std::cout << "name: " << tc.name << "\n"; + std::cout << "args: " << tc.arguments << "\n"; + } + } + }; + + std::string reasoning = "To plan an effective trip to Japan that includes both historical sites and modern attractions within a " + "budget of $4000 for a two-week stay, we need to:\n\n" + "1. Identify key historical sites and modern attractions in Japan.\n" + "2. Find affordable accommodation options that provide a balance between comfort and cost.\n" + "3. Determine the best modes of transportation for getting around Japan.\n" + "4. Create a day-by-day itinerary that ensures the user gets to see a variety of attractions without " + "overspending.\n" + "5. 
Provide a detailed cost breakdown that includes accommodation, transportation, meals, and entry fees "
+        "to attractions.";
+
+    std::vector<std::tuple<std::string, std::string, nlohmann::json>> tool_calls = {{
+        "call_0",
+        "plan_trip",
+        nlohmann::json::parse(R"({
+            "destination": "Japan",
+            "duration": 14,
+            "budget": 4000,
+            "interests": ["historical sites", "modern attractions"],
+            "accommodation_preferences": "affordable",
+            "transportation_preferences": "efficient",
+            "meal_preferences": "local cuisine"
+        })")
+    }};
+
+    std::vector<std::string> tokens;
+
+    // Build tokens
+    if (!reasoning.empty()) {
+        auto tokenized = simple_tokenize(reasoning);
+        tokens.emplace_back("<|START_THINKING|>");
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+        tokens.emplace_back("<|END_THINKING|>");
+    }
+
+    if (!tool_calls.empty()) {
+        tokens.emplace_back("<|START_ACTION|>");
+
+        auto json = nlohmann::json::array();
+        for (const auto & tc : tool_calls) {
+            auto tc_json = nlohmann::json::object();
+            tc_json["tool_call_id"] = std::get<0>(tc);
+            tc_json["tool_name"]    = std::get<1>(tc);
+            tc_json["parameters"]   = std::get<2>(tc);
+            json.push_back(tc_json);
+        }
+
+        auto tokenized = simple_tokenize(json.dump(-1, ' ', true));
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+
+        tokens.emplace_back("<|END_ACTION|>");
+    }
+
+    std::string input = std::accumulate(tokens.begin(), tokens.end(), std::string());
+
+    // Run tests
+    t.test("legacy_parse", [&](testing & /* t */) {
+        test_legacy(input, false, false);
+    });
+
+    t.test("current_parse", [&](testing & /* t */) {
+        test_current(parser, input, false, false);
+    });
+
+    // Run benchmarks
+    t.bench("legacy_parse_benchmark complete", [&]() {
+        test_legacy(input, false, false);
+    });
+
+    t.bench("legacy_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+
+            try {
+                test_legacy(in, i + 1 < tokens.size(), false);
+            } catch (common_chat_msg_partial_exception & /* e */) {
+                // Do nothing, this is expected
+            }
+        }
+    }, 20);
+
+    t.bench("current_parse_benchmark complete", [&]() {
+        test_current(parser, input, false, false);
+    }, 100);
+
+    t.bench("current_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+            test_current(parser, in, i + 1 < tokens.size(), false);
+        }
+    }, 20);
+}
diff --git a/llama.cpp/tests/test-chat-template.cpp b/llama.cpp/tests/test-chat-template.cpp
new file mode 100644
index 0000000..27b537a
--- /dev/null
+++ b/llama.cpp/tests/test-chat-template.cpp
@@ -0,0 +1,680 @@
+#include <iostream>
+#include <fstream>
+#include <filesystem>
+#include <iterator>
+#include <regex>
+#include <string>
+#include <vector>
+
+#include <nlohmann/json.hpp>
+
+#undef NDEBUG
+#include <cassert>
+
+#include "llama.h"
+#include "common.h"
+#include "chat.h"
+#include "jinja/runtime.h"
+#include "jinja/parser.h"
+#include "jinja/lexer.h"
+#include "jinja/caps.h"
+
+using json = nlohmann::ordered_json;
+
+int main_automated_tests(void);
+
+void run_multiple(std::string dir_path, bool stop_on_first_failure, json input, bool use_common = false);
+void run_single(std::string contents, json input, bool use_common = false, const std::string & output_path = "");
+
+
+
+std::string HELP = R"(
+Usage: test-chat-template [OPTIONS] PATH_TO_TEMPLATE
+Options:
+  -h, --help            Show this help message and exit.
+  --json <file>         Path to the JSON input file.
+  --stop-on-first-fail  Stop testing on the first failure (default: false).
+  --no-common           Use direct Jinja engine instead of common chat templates (default: use common).
+  --output <file>       Path to output results (only for single template runs).
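+
+Examples:
+  test-chat-template path/to/template.jinja
+  test-chat-template --json input.json path/to/templates/
+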
+If PATH_TO_TEMPLATE is a file, runs that single template.
+If PATH_TO_TEMPLATE is a directory, runs all .jinja files in that directory.
+If PATH_TO_TEMPLATE is omitted, runs automated tests (default CI mode).
+)";
+
+std::string DEFAULT_JSON = R"({
+    "messages": [
+        {
+            "role": "user",
+            "content": "Hello, how are you?"
+        },
+        {
+            "role": "assistant",
+            "content": "I am fine, thank you!"
+        }
+    ],
+    "bos_token": "",
+    "eos_token": "",
+    "add_generation_prompt": true
+})";
+
+int main(int argc, char ** argv) {
+    std::vector<std::string> args(argv, argv + argc);
+
+    std::string tmpl_path;
+    std::string json_path;
+    std::string output_path;
+    bool stop_on_first_fail = false;
+    bool use_common = true;
+
+    for (size_t i = 1; i < args.size(); i++) {
+        if (args[i] == "--help" || args[i] == "-h") {
+            std::cout << HELP << "\n";
+            return 0;
+        } else if (args[i] == "--json" && i + 1 < args.size()) {
+            json_path = args[i + 1];
+            i++;
+        } else if (args[i] == "--stop-on-first-fail") {
+            stop_on_first_fail = true;
+        } else if (args[i] == "--output" && i + 1 < args.size()) {
+            output_path = args[i + 1];
+            i++;
+        } else if (args[i] == "--no-common") {
+            // switch to the direct Jinja engine, as documented in HELP
+            use_common = false;
+        } else if (tmpl_path.empty()) {
+            tmpl_path = args[i];
+        } else {
+            std::cerr << "Unknown argument: " << args[i] << "\n";
+            std::cout << HELP << "\n";
+            return 1;
+        }
+    }
+
+    if (tmpl_path.empty()) {
+        return main_automated_tests();
+    }
+
+    json input_json;
+    if (!json_path.empty()) {
+        std::ifstream json_file(json_path);
+        if (!json_file) {
+            std::cerr << "Error: Could not open JSON file: " << json_path << "\n";
+            return 1;
+        }
+        std::string content = std::string(
+            std::istreambuf_iterator<char>(json_file),
+            std::istreambuf_iterator<char>());
+        input_json = json::parse(content);
+    } else {
+        input_json = json::parse(DEFAULT_JSON);
+    }
+
+    std::filesystem::path p(tmpl_path);
+    if (std::filesystem::is_directory(p)) {
+        run_multiple(tmpl_path, stop_on_first_fail, input_json, use_common);
+    } else if (std::filesystem::is_regular_file(p)) {
+        std::ifstream infile(tmpl_path);
+        std::string contents = std::string(
+            std::istreambuf_iterator<char>(infile),
+            std::istreambuf_iterator<char>());
+        run_single(contents, input_json, use_common, output_path);
+    } else {
+        std::cerr << "Error: PATH_TO_TEMPLATE is not a valid file or directory: " << tmpl_path << "\n";
+        return 1;
+    }
+
+    return 0;
+}
+
+void run_multiple(std::string dir_path, bool stop_on_first_fail, json input, bool use_common) {
+    std::vector<std::string> failed_tests;
+
+    // list all files in models/templates/ and run each
+    size_t test_count = 0;
+
+    for (const auto & entry : std::filesystem::directory_iterator(dir_path)) {
+        // only process .jinja files
+        if (entry.path().extension() == ".jinja" && entry.is_regular_file()) {
+            test_count++;
+            std::cout << "\n\n=== RUNNING TEMPLATE FILE: " << entry.path().string() << " ===\n";
+            std::ifstream infile(entry.path());
+            std::string contents((std::istreambuf_iterator<char>(infile)), std::istreambuf_iterator<char>());
+            try {
+                run_single(contents, input, use_common);
+            } catch (const std::exception & e) {
+                std::cout << "Exception: " << e.what() << "\n";
+                std::cout << "=== ERROR WITH TEMPLATE FILE: " << entry.path().string() << " ===\n";
+                failed_tests.push_back(entry.path().string());
+                if (stop_on_first_fail) {
+                    break;
+                }
+            }
+        }
+    }
+
+    std::cout << "\n\n=== TEST SUMMARY ===\n";
+    std::cout << "Total tests run: " << test_count << "\n";
+    std::cout << "Total failed tests: " << failed_tests.size() << "\n";
+    for (const auto & test : failed_tests) {
+        std::cout << "FAILED TEST: " << test << "\n";
"\n"; + } +} + + +static std::string normalize_newlines(const std::string & s) { +#ifdef _WIN32 + static const std::regex nl_regex("\r\n"); + return std::regex_replace(s, nl_regex, "\n"); +#else + return s; +#endif +} + + +static std::string format_using_common( + const std::string & template_str, + const std::string & bos_token, + const std::string & eos_token, + std::vector & messages, + std::vector tools = {}) { + auto tmpls = common_chat_templates_init(/* model= */ nullptr, template_str, bos_token, eos_token); + common_chat_templates_inputs inputs; + inputs.use_jinja = true; + inputs.messages = messages; + inputs.tools = tools; + inputs.add_generation_prompt = true; + auto output = common_chat_templates_apply(tmpls.get(), inputs).prompt; + output = normalize_newlines(output); + return output; +} + + +// skip libcommon, use direct jinja engine +static jinja::value_string format_using_direct_engine( + const std::string & template_str, + json & input) { + // lexing + jinja::lexer lexer; + auto lexer_res = lexer.tokenize(template_str); + + // compile to AST + jinja::program ast = jinja::parse_from_tokens(lexer_res); + + // check caps for workarounds + jinja::caps_get(ast); + + std::cout << "\n=== RUN ===\n"; + jinja::context ctx(template_str); + + jinja::global_from_json(ctx, input, true); + + jinja::runtime runtime(ctx); + const jinja::value results = runtime.execute(ast); + auto parts = runtime.gather_string_parts(results); + + std::cout << "\n=== RESULTS ===\n"; + for (const auto & part : parts->as_string().parts) { + std::cout << (part.is_input ? "DATA" : "TMPL") << ": " << part.val << "\n"; + } + + return parts; +} + + +void run_single(std::string contents, json input, bool use_common, const std::string & output_path) { + jinja::enable_debug(true); + + jinja::value_string output_parts; + + if (use_common) { + std::string bos_token = ""; + std::string eos_token = ""; + if (input.contains("bos_token")) { + bos_token = input["bos_token"].get(); + } + if (input.contains("eos_token")) { + eos_token = input["eos_token"].get(); + } + nlohmann::ordered_json msgs_json = input["messages"]; + nlohmann::ordered_json tools_json = input["tools"]; + auto messages = common_chat_msgs_parse_oaicompat(msgs_json); + auto tools = common_chat_tools_parse_oaicompat(tools_json); + auto output = format_using_common(contents, bos_token, eos_token, messages, tools); + std::cout << "\n=== OUTPUT ===\n"; + std::cout << output << "\n"; + output_parts = jinja::mk_val(output); + + } else { + output_parts = format_using_direct_engine(contents, input); + std::cout << "\n=== OUTPUT ===\n"; + std::cout << output_parts->as_string().str() << "\n"; + } + + if (!output_path.empty()) { + std::ofstream outfile(output_path); + if (!outfile) { + throw std::runtime_error("Could not open output file: " + output_path); + } + outfile << output_parts->as_string().str(); + outfile.close(); + std::cout << "\n=== OUTPUT WRITTEN TO " << output_path << " ===\n"; + } +} + + + + + +// +// Automated tests for chat templates +// + +#define U8C(x) (const char*)(u8##x) + +static common_chat_msg simple_msg(const std::string & role, const std::string & content) { + common_chat_msg msg; + msg.role = role; + msg.content = content; + return msg; +} + +int main_automated_tests(void) { + // jinja::enable_debug(true); + + std::vector conversation { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "Who are you"}, + {"assistant", " I am an assistant "}, + {"user", "Another question"}, + }; + + // 
+    // std::string wrong = /* .template_str= */ u8"[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}";
+    struct TestCase {
+        std::string name;
+        std::string template_str;
+        std::string expected_output;
+        std::string expected_output_jinja;
+        std::string bos_token = "";
+        std::string eos_token = "";
+        bool supported_with_jinja = true;
+    };
+    std::vector<TestCase> test_cases {
+        {
+            /* .name= */ "teknium/OpenHermes-2.5-Mistral-7B",
+            /* .template_str= */ "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+            /* .expected_output= */ "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "TheBloke/FusionNet_34Bx2_MoE-AWQ",
+            /* .template_str= */ "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
+            /* .expected_output= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "bofenghuang/vigogne-2-70b-chat",
+            /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s>[INST] Who are you [/INST]I am an assistant</s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "<s>[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "mlabonne/AlphaMonarch-7B",
+            /* .template_str= */ "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
+            /* .expected_output= */ "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .expected_output_jinja= */ "<s>system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "google/gemma-7b-it",
+            /* .template_str= */ "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+            /* .expected_output= */ "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+            /* .expected_output_jinja= */ "<start_of_turn>user\nYou are a helpful assistant\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        },
+        {
+            /* .name= */ "OrionStarAI/Orion-14B-Chat",
+            /* .template_str= */ "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "Human: You are a 
helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", + /* .expected_output_jinja= */ "Human: You are a helpful assistant\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "openchat/openchat-3.5-0106", + // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d + // So we match against the included template but implement the suggested version. + /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}", + /* .expected_output= */ "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:", + /* .expected_output_jinja= */ "GPT4 Correct System: You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:", + }, + { + /* .name= */ "deepseek-ai/deepseek-coder-33b-instruct", + /* .template_str= */ "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + /* .expected_output= */ "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n I am an assistant \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n", + /* .expected_output_jinja= */ "", + }, + { + /* .name= */ "eachadea/vicuna-13b-1.1", + // No template included in tokenizer_config.json, so this template likely needs to be manually set. 
+ /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", + /* .expected_output= */ "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "Orca-Vicuna", + // No template included in tokenizer_config.json, so this template likely needs to be manually set. + /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", + /* .expected_output= */ "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "CohereForAI/c4ai-command-r-plus", + /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", + /* .expected_output= */ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", + /* .expected_output_jinja= */ "", + }, + { + /* .name= */ "Llama-3", + /* .template_str= */ "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + /* .expected_output= */ "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + /* .expected_output_jinja= */ "", + }, + { + /* .name= */ "Phi-3-mini", + /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", + /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + }, + { + /* .name= */ "Phi-3-small", + /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + /* .expected_output= */ "<|system|>\nYou are a helpful 
assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + /* .expected_output_jinja= */ "", + }, + { + /* .name= */ "Phi-3-medium", + /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", + /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + }, + { + /* .name= */ "Phi-3-vision", + /* .template_str= */ "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}", + /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "ChatGLM3", + /* .template_str= */ "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", + /* .expected_output= */ "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n I am an assistant <|user|>\n Another question<|assistant|>", + /* .expected_output_jinja= */ "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n I am an assistant <|user|>\n Another question<|assistant|>", + }, + { + /* .name= */ "ChatGLM4", + /* .template_str= */ U8C("[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}"), + /* .expected_output= */ "[gMASK]<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>\n", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "GLMEdge", + /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ 
item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>", + /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF", + /* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"), + /* .expected_output= */ U8C("You are a helpful assistant<用户>HelloHi there<用户>Who are youI am an assistant<用户>Another question"), + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "DeepSeek-V2", + /* .template_str= */ "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + /* .expected_output= */ U8C("You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:"), + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "<|end▁of▁sentence|>", + }, + { + /* .name= */ "ibm-granite/granite-3.0-8b-instruct", + /* .template_str= */ "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}", + /* .expected_output= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant 
<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>", + /* .expected_output_jinja= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>", + }, + { + /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt)", + /* .template_str= */ "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n", + /* .expected_output= */ " [INST] You are a helpful assistant\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + /* .expected_output_jinja= */ " [INST] You are a helpful assistant\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start)", + /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS] [\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- 
'{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- \"[TOOL_CALLS] [\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- \" \" + message[\"content\"]|trim + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + /* .expected_output= */ "[INST] You are a helpful assistant\n\nHello[/INST] Hi there[INST] Who are you[/INST] I am an assistant[INST] Another question[/INST]", + /* .expected_output_jinja= */ "[INST] Hello[/INST] Hi there[INST] Who are you[/INST] I am an assistant[INST] You are a helpful assistant\n\nAnother question[/INST]", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start)", + /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must 
alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", + /* .expected_output= */ "[INST]You are a helpful assistant\n\nHello[/INST]Hi there[INST]Who are you[/INST] I am an assistant [INST]Another question[/INST]", + /* .expected_output_jinja= */ "[INST]Hello[/INST]Hi there[INST]Who are you[/INST] I am an assistant [INST]You are a helpful assistant\n\nAnother question[/INST]", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)", + /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}", + /* .expected_output= */ "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi 
there[INST] Who are you[/INST] I am an assistant [INST] Another question[/INST]", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "ai-sage/GigaChat-20B-A3B-instruct", + /* .template_str= */ "{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}", + /* .expected_output= */ "You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + /* .supported_with_jinja= */ false, // Requires additional_special_tokens as extra context + }, + { + /* .name= */ "Infinigence/Megrez-3B-Instruct", + /* .template_str= */ U8C("{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct,将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"), + /* .expected_output= */ "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "phi-4", + /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}", + /* .expected_output= */ "<|im_start|>system<|im_sep|>You 
are a helpful assistant<|im_end|><|im_start|>user<|im_sep|>Hello<|im_end|><|im_start|>assistant<|im_sep|>Hi there<|im_end|><|im_start|>user<|im_sep|>Who are you<|im_end|><|im_start|>assistant<|im_sep|> I am an assistant <|im_end|><|im_start|>user<|im_sep|>Another question<|im_end|><|im_start|>assistant<|im_sep|>", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "yandex/YandexGPT-5-Lite-8B-instruct", + /* .template_str= */ "{%- set names = {'assistant': ' Ассистент:', 'user': ' Пользователь:'} %}\n{%- set tools_prefix = 'Тебе доступны следующие функции:' %}\n{%- macro __render_tool(tool) %}\n {%- set name = tool.function.name %}\n {%- set description = tool.function.description|default('') %}\n {%- set parameters = tool.function.parameters|tojson %}\n {{- '\\n' }}function {{ '{' }}'name':'{{ name }}',\n {%- if tool.function.description %}'description':'{{ description }}',{% endif %}\n'parameters':{{ parameters }}\n {{- '}' }}\n{%- endmacro %}\n{%- macro __render_tools(tools) %}\n {{- tools_prefix }}\n {%- for tool in tools %}\n {{- __render_tool(tool) }}\n {%- endfor %}\n {{- '\\n\\n' }}\n{%- endmacro %}\n{%- macro __render_tool_message(message) %}\n {{- '\\n\\nРезультат вызова' }} {{ message.name }}: {{ message.content }} {{ '\\n\\n' }}\n{%- endmacro %}\n{%- if tools -%}\n {{- __render_tools(tools) }}\n{%- endif -%}\n{%- macro __render_user_message(message) %}\n{{ names.user }} {{ message.content + '\\n\\n' }}\n{%- endmacro %}\n{%- macro __render_assistant_message(message) %}\n {{- names.assistant }}\n {%- set call = message['function_call'] %}\n {%- if call %}\n {{- '\\n[TOOL_CALL_START]' }}{{ call.name }}{{ '\\n' }}{{ call.arguments|tojson }}\n {%- else %}\n {{- ' ' + message.content + '\\n\\n' }}\n {%- endif %}\n{%- endmacro %}\n{%- if not add_generation_prompt is defined %}\n{%- set add_generation_prompt = false %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- __render_user_message(message) }}\n {%- endif %}\n {%- if message.role == 'assistant' and not loop.last %}\n {{- __render_assistant_message(message) }}\n {%- endif %}\n {%- if message.role == 'tool' %}\n {{- __render_tool_message(message) }}\n {%- endif %}\n {%- if loop.last %}\n {{- ' Ассистент:[SEP]' }}\n {%- endif %}\n{%- endfor %}\n", + /* .expected_output= */ " Пользователь: Hello\n\n Ассистент: Hi there\n\n Пользователь: Who are you\n\n Ассистент: I am an assistant \n\n Пользователь: Another question\n\n Ассистент:[SEP]", + /* .expected_output_jinja= */ " Пользователь: You are a helpful assistant\nHello\n\n Ассистент: Hi there\n\n Пользователь: Who are you\n\n Ассистент: I am an assistant \n\n Пользователь: Another question\n\n Ассистент:[SEP]", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "inclusionAI/Ling-lite", + /* .template_str */ "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '' + role + '' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT' }}{% endif %}", + /* .expected_output= */ "SYSTEMYou are a helpful assistantHUMANHelloASSISTANTHi thereHUMANWho are youASSISTANT I am an assistant HUMANAnother questionASSISTANT", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + }, + { + /* .name= */ "ByteDance-Seed/Seed-OSS-36B-Instruct", + /* .template_str */ "{# #}{%- for message in messages %}{%- if message.role in 
[\"user\", \"system\"] %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- elif message.role == \"assistant\" %}{{ bos_token + message.role }}{%- if message.content is defined and message.content is string and message.content|trim|length > 0 %}{{ \"\\n\" + message.content|trim + eos_token }}{%- endif %}{%- else %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- endif %}{%- endfor %}{%- if add_generation_prompt %}{{ bos_token + \"assistant\\n\" }}{%- endif %}", + /* .expected_output= */ "system\nYou are a helpful assistantuser\nHelloassistant\nHi thereuser\nWho are youassistant\nI am an assistantuser\nAnother questionassistant\n", + /* .expected_output_jinja= */ "system\nYou are a helpful assistantuser\nHelloassistant\nHi thereuser\nWho are youassistant\nI am an assistantuser\nAnother questionassistant\n", + /* .bos_token= */ "", + /* .eos_token= */ "", + } + }; + std::vector formatted_chat(1024); + int32_t res; + + // list all supported templates + std::vector supported_tmpl; + res = llama_chat_builtin_templates(nullptr, 0); + assert(res > 0); + supported_tmpl.resize(res); + res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size()); + std::cout << "Built-in chat templates:\n"; + for (auto tmpl : supported_tmpl) { + std::cout << " " << tmpl << "\n"; + } + + // test invalid chat template + res = llama_chat_apply_template("INVALID TEMPLATE", conversation.data(), conversation.size(), true, formatted_chat.data(), formatted_chat.size()); + assert(res < 0); + const auto add_generation_prompt = true; + + for (const auto & test_case : test_cases) { + std::cout << "\n\n=== " << test_case.name << " ===\n\n"; + formatted_chat.resize(1024); + res = llama_chat_apply_template( + test_case.template_str.c_str(), + conversation.data(), + conversation.size(), + add_generation_prompt, + formatted_chat.data(), + formatted_chat.size() + ); + formatted_chat.resize(res); + std::string output(formatted_chat.data(), formatted_chat.size()); + if (output != test_case.expected_output) { + std::cout << "Expected:\n" << test_case.expected_output << "\n"; + std::cout << "-------------------------\n"; + std::cout << "Actual:\n" << output << "\n"; + std::cout.flush(); + assert(output == test_case.expected_output); + } + } + + std::vector messages; + for (const auto & msg : conversation) { + messages.push_back(simple_msg(msg.role, msg.content)); + } + for (const auto & test_case : test_cases) { + if (!test_case.supported_with_jinja) { + continue; + } + std::cout << "\n\n=== " << test_case.name << " (jinja) ===\n\n"; + try { + auto output = format_using_common( + test_case.template_str, + test_case.bos_token, + test_case.eos_token, + messages); + auto expected_output = normalize_newlines(test_case.expected_output_jinja.empty() ? 
+            if (output != expected_output) {
+                std::cout << "Template:```\n" << test_case.template_str << "\n```";
+                std::cout << "-------------------------\n";
+                std::cout << "Expected:```\n" << expected_output << "\n```";
+                std::cout << "-------------------------\n";
+                std::cout << "Actual:```\n" << output << "\n```";
+                std::cout.flush();
+                assert(output == expected_output);
+            }
+        } catch (const std::exception & e) {
+            std::cerr << "ERROR: " << e.what() << "\n";
+            assert(false);
+        }
+    }
+
+    // TODO: llama_chat_format_single will be deprecated, remove these tests later
+
+    // test llama_chat_format_single for system message
+    std::cout << "\n\n=== llama_chat_format_single (system message) ===\n\n";
+    std::vector<common_chat_msg> chat2;
+    auto sys_msg = simple_msg("system", "You are a helpful assistant");
+
+    auto fmt_sys = [&](std::string tmpl_str) {
+        auto tmpls = common_chat_templates_init(/* model= */ nullptr, tmpl_str);
+        auto output = common_chat_format_single(tmpls.get(), chat2, sys_msg, false, /* use_jinja= */ false);
+        std::cout << "fmt_sys(" << tmpl_str << ") : " << output << "\n";
+        std::cout << "-------------------------\n";
+        return output;
+    };
+    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
+    assert(fmt_sys("mistral-v1") == " [INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3") == "[INST] You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v3-tekken") == "[INST]You are a helpful assistant\n\n");
+    assert(fmt_sys("mistral-v7") == "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT]");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("llama2-sys") == "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\n");
+    assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates
+    assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
+    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+    assert(fmt_sys("gigachat") == "You are a helpful assistant<|message_sep|>");
+
+
+    // test llama_chat_format_single for user message
+    std::cout << "\n\n=== llama_chat_format_single (user message) ===\n\n";
+    chat2.push_back(simple_msg("system", "You are a helpful assistant"));
+    chat2.push_back(simple_msg("user", "Hello"));
+    chat2.push_back(simple_msg("assistant", "I am assistant"));
+    auto new_msg = simple_msg("user", "How are you");
+
+    auto fmt_single = [&](const std::string & tmpl_str) {
+        auto tmpls = common_chat_templates_init(/* model= */ nullptr, tmpl_str.c_str());
+        auto output = common_chat_format_single(tmpls.get(), chat2, new_msg, true, /* use_jinja= */ false);
+        std::cout << "fmt_single(" << tmpl_str << ") : " << output << "\n";
+        std::cout << "-------------------------\n";
+        return output;
+    };
+    assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+    assert(fmt_single("mistral-v1") == " [INST] How are you [/INST]");
+    assert(fmt_single("mistral-v3") == "[INST] How are you[/INST]");
+    assert(fmt_single("mistral-v3-tekken") == "[INST]How are you[/INST]");
+    assert(fmt_single("mistral-v7") == "[INST] How are you[/INST]");
+    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
+    assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates
+    assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + // assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>"); + + std::cout << "\nOK: All tests passed successfully.\n"; + + return 0; +} diff --git a/llama.cpp/tests/test-chat.cpp b/llama.cpp/tests/test-chat.cpp new file mode 100644 index 0000000..4378a8d --- /dev/null +++ b/llama.cpp/tests/test-chat.cpp @@ -0,0 +1,4063 @@ +// Tests chat handling, including grammar generation and parsing for tool calling, for various templates. +// +// Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates, +// e.g. given Minja (http://github.com/google/minja) checked out in parent dir: +// +// cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null +// +#include "chat.h" + +#include "log.h" + +#include "../src/unicode.h" +#include "../src/llama-grammar.h" + +#include + +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +static std::ostream & operator<<(std::ostream & os, const common_chat_msg_diff & diff) { + os << "{ content_delta: " << diff.content_delta << "; "; + os << "reasoning_content_delta: " << diff.reasoning_content_delta << "; "; + if (diff.tool_call_index != std::string::npos) { + os << "tool_call_index: " << diff.tool_call_index << "; "; + os << "tool_call_delta.name: " << diff.tool_call_delta.name << "; "; + os << "tool_call_delta.id: " << diff.tool_call_delta.id << "; "; + os << "tool_call_delta.arguments: " << diff.tool_call_delta.arguments << "; "; + } + os << "}"; + return os; +} +// operator<< for vector: +static std::ostream & operator<<(std::ostream & os, const std::vector & diffs) { + os << "[\n"; + for (const auto & diff : diffs) { + os << " " << diff << ",\n"; + } + os << "]"; + return os; +} +static std::ostream & operator<<(std::ostream & os, const common_chat_msg & msg) { + os << "{ role: " << msg.role << "; "; + os << "content: " << msg.content << "; "; + os << "content_parts: [\n"; + for (const auto & part : msg.content_parts) { + os << " { type: " << part.type << "; text: " << part.text << " },\n"; + } + os << "]; "; + os << "reasoning_content: " << msg.reasoning_content << "; "; + os << "tool_calls: [\n"; + for (const auto & tool_call : msg.tool_calls) { + os << " { name: " << tool_call.name << "; arguments: " << tool_call.arguments << "; id: " << tool_call.id << " },\n"; + } + os << "]"; + os << "}"; + return os; +} + +template static bool equals(const T & expected, const T & actual) { + return expected == actual; +} + +static common_chat_msg normalize(const common_chat_msg & msg) { + common_chat_msg normalized = msg; + for (auto & tool_call : normalized.tool_calls) { + try { + tool_call.arguments = json::parse(tool_call.arguments).dump(); + } catch (const std::exception &) { + // Do nothing + } + } + return normalized; +} + + +template <> +bool equals(const common_chat_msg & expected, const common_chat_msg & actual) { + return normalize(expected) == normalize(actual); +} + +template static void assert_equals(const T & expected, const T & actual) { + if (!equals(expected, actual)) { + std::cerr << "Expected:```\n" << expected << "\n```" << std::endl; + std::cerr << "Actual:```\n" << actual << "\n```" << std::endl; + std::cerr << std::flush; + throw std::runtime_error("Test failed"); + } +} + +static std::string 
+    std::cerr << "# Reading: " << path << '\n' << std::flush;
+    std::ifstream fs(path, std::ios_base::binary);
+    if (!fs.is_open()) {
+        fs = std::ifstream("../" + path, std::ios_base::binary);
+        if (!fs.is_open()) {
+            throw std::runtime_error("Failed to open file: " + path);
+        }
+    }
+    fs.seekg(0, std::ios_base::end);
+    auto size = fs.tellg();
+    fs.seekg(0);
+    std::string out;
+    out.resize(static_cast<size_t>(size));
+    fs.read(out.data(), static_cast<std::streamsize>(size));
+    return out;
+}
+
+static common_chat_templates_ptr read_templates(const std::string & path) {
+    return common_chat_templates_ptr(common_chat_templates_init(/* model= */ nullptr, read_file(path)));
+}
+
+static std::unique_ptr<llama_grammar> build_grammar(const std::string & grammar_str) {
+    return std::unique_ptr<llama_grammar>(
+        llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0));
+}
+
+// TODO: extract to common helper (copied from test-grammar-integration.cpp)
+static bool match_string(const std::string & input, llama_grammar * grammar) {
+    const auto cpts = unicode_cpts_from_utf8(input);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    for (const auto & cpt : cpts) {
+        llama_grammar_accept(grammar, cpt);
+
+        if (stacks_cur.empty()) {
+            // no stacks means that the grammar failed to match at this point
+            return false;
+        }
+    }
+
+    if (std::any_of(stacks_cur.begin(), stacks_cur.end(), [](const auto & stack) { return stack.empty(); })) {
+        // An empty stack means that the grammar has been completed
+        return true;
+    }
+
+    return false;
+}
+
+static std::string renormalize_json(const std::string & json_str) {
+    try {
+        auto json_obj = json::parse(json_str);
+        return json_obj.dump();
+    } catch (const std::exception & e) {
+        std::cerr << "Failed to parse JSON: " << e.what() << '\n';
+        return json_str;
+    }
+}
+static void assert_msg_equals(const common_chat_msg & expected, const common_chat_msg & actual, bool ignore_whitespace_differences = false) {
+    assert_equals(expected.role, actual.role);
+    if (ignore_whitespace_differences) {
+        assert_equals(string_strip(expected.content), string_strip(actual.content));
+    } else {
+        assert_equals(expected.content, actual.content);
+    }
+    assert_equals(expected.content_parts.size(), actual.content_parts.size());
+    for (size_t i = 0; i < expected.content_parts.size(); i++) {
+        const auto & expected_part = expected.content_parts[i];
+        const auto & actual_part = actual.content_parts[i];
+        assert_equals(expected_part.type, actual_part.type);
+        if (ignore_whitespace_differences) {
+            assert_equals(string_strip(expected_part.text), string_strip(actual_part.text));
+        } else {
+            assert_equals(expected_part.text, actual_part.text);
+        }
+    }
+    if (ignore_whitespace_differences) {
+        assert_equals(string_strip(expected.reasoning_content), string_strip(actual.reasoning_content));
+    } else {
+        assert_equals(expected.reasoning_content, actual.reasoning_content);
+    }
+    assert_equals(expected.tool_calls.size(), actual.tool_calls.size());
+    for (size_t i = 0; i < expected.tool_calls.size(); i++) {
+        const auto & expected_tool_call = expected.tool_calls[i];
+        const auto & actual_tool_call = actual.tool_calls[i];
+        assert_equals(expected_tool_call.name, actual_tool_call.name);
+        assert_equals(renormalize_json(expected_tool_call.arguments), renormalize_json(actual_tool_call.arguments));
+        assert_equals(expected_tool_call.id, actual_tool_call.id);
+    }
+}
+
+common_chat_tool special_function_tool {
+    /* .name = */ "special_function",
+    /* .description = */ "I'm special",
/* .parameters = */ R"({ + "type": "object", + "properties": { + "arg1": { + "type": "integer", + "description": "The arg." + } + }, + "required": ["arg1"] + })", +}; +common_chat_tool special_function_tool_with_optional_param { + /* .name = */ "special_function_with_opt", + /* .description = */ "I'm special but have optional stuff", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "arg1": { + "type": "integer", + "description": "The arg." + }, + "arg2": { + "type": "integer", + "description": "The optional arg." + } + }, + "required": ["arg1"] + })", +}; +common_chat_tool python_tool { + /* .name = */ "python", + /* .description = */ "an ipython interpreter", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Python code to execute." + } + }, + "required": ["code"] + })", +}; +common_chat_tool code_interpreter_tool { + /* .name = */ "code_interpreter", + /* .description = */ "an ipython interpreter", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "code": { + "type": "string", + "description": "Python code to execute." + } + }, + "required": ["code"] + })", +}; +std::vector tools { special_function_tool, special_function_tool_with_optional_param, python_tool }; +std::vector llama_3_1_tools { special_function_tool, code_interpreter_tool }; + +struct delta_data { + std::string delta; + common_chat_params params; +}; + +static common_chat_msg simple_assist_msg(const std::string & content, const std::string & reasoning_content = "", const std::string & tool_name = "", const std::string & arguments = "", const std::string & id = "") { + common_chat_msg msg; + msg.role = "assistant"; + msg.content = content; + msg.reasoning_content = reasoning_content; + if (!tool_name.empty()) { + msg.tool_calls.push_back({ tool_name, arguments, id }); + } + return msg; +} + +static delta_data init_delta(const struct common_chat_templates * tmpls, const std::vector & end_tokens, + const common_chat_msg & user_message, + const common_chat_msg & delta_message, + const std::vector & tools, + const common_chat_tool_choice & tool_choice) { + common_chat_templates_inputs inputs; + inputs.parallel_tool_calls = true; + inputs.messages.push_back(user_message); + inputs.tools = tools; + inputs.tool_choice = tool_choice; + auto params_prefix = common_chat_templates_apply(tmpls, inputs); + + inputs.messages.push_back(delta_message); + inputs.add_generation_prompt = false; + auto params_full = common_chat_templates_apply(tmpls, inputs); + + std::string prefix = params_prefix.prompt; + std::string full = params_full.prompt; + + if (full == prefix) { + throw std::runtime_error("Full message is the same as the prefix"); + } + + size_t common_prefix_length = 0; + for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) { + if (prefix[i] != full[i]) { + break; + } + if (prefix[i] == '<') { + // DeepSeek R1's template (as of 20250209) adds a trailing if add_generation_prompt, + // but it removes thinking tags for past messages. + // The prefix and full strings diverge at vs. <|tool▁calls▁begin|>, we avoid consuming the leading <. 
+            continue;
+        }
+        common_prefix_length = i + 1;
+    }
+    auto delta = full.substr(common_prefix_length);
+
+    // Strip end tokens
+    for (const auto & end_token : end_tokens) {
+        // rfind to find the last occurrence
+        auto pos = delta.rfind(end_token);
+        if (pos != std::string::npos) {
+            delta = delta.substr(0, pos);
+            break;
+        }
+    }
+    return { delta, params_full };
+}
+
+/*
+  Applies the template to one user message with add_generation_prompt=true, then to the test message
+  with add_generation_prompt=false; takes the diff, removes any end tokens, and parses the result
+  with the grammar, checking that the parsed message is the same as the test_message.
+*/
+static void test_templates(const struct common_chat_templates * tmpls, const std::vector<std::string> & end_tokens,
+                           const common_chat_msg & test_message,
+                           const std::vector<common_chat_tool> & tools = {},
+                           const std::string & expected_delta = "",
+                           bool expect_grammar_triggered = true,
+                           bool test_grammar_if_triggered = true,
+                           common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE,
+                           bool ignore_whitespace_differences = false
+                          ) {
+    common_chat_msg user_message;
+    user_message.role = "user";
+    user_message.content = "Hello, world!";
+
+    for (const auto & tool_choice : std::vector<common_chat_tool_choice> {COMMON_CHAT_TOOL_CHOICE_AUTO, COMMON_CHAT_TOOL_CHOICE_REQUIRED}) {
+        auto data = init_delta(tmpls, end_tokens, user_message, test_message, tools, tool_choice);
+        if (!expected_delta.empty()) {
+            if (ignore_whitespace_differences) {
+                assert_equals(string_strip(expected_delta), string_strip(data.delta));
+            } else {
+                assert_equals(expected_delta, data.delta);
+            }
+        }
+
+        if (expect_grammar_triggered) {
+            // TODO @ngxson : refactor common_chat_parse to avoid passing format/reasoning_format every time
+            common_chat_parser_params params;
+            params.format = data.params.format;
+            params.reasoning_format = reasoning_format;
+            const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, params);
+            assert_msg_equals(test_message, msg, ignore_whitespace_differences);
+        }
+
+        if (!test_message.tool_calls.empty()) {
+            GGML_ASSERT(!data.params.grammar.empty());
+        }
+        if (!data.params.grammar.empty()) {
+            auto grammar = build_grammar(data.params.grammar);
+            if (!grammar) {
+                throw std::runtime_error("Failed to build grammar");
+            }
+            auto earliest_trigger_pos = std::string::npos;
+            auto constrained = data.delta;
+            for (const auto & trigger : data.params.grammar_triggers) {
+                size_t pos = std::string::npos;
+                std::smatch match;
+                switch (trigger.type) {
+                    case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                    {
+                        const auto & word = trigger.value;
+                        pos = constrained.find(word);
+                        break;
+                    }
+                    case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                    {
+                        const auto & pattern = trigger.value;
+                        if (std::regex_search(constrained, match, std::regex(pattern))) {
+                            pos = match.position(1);
+                        }
+                        break;
+                    }
+                    case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                    {
+                        const auto & pattern = trigger.value;
+                        if (std::regex_match(constrained, match, std::regex(pattern))) {
+                            auto mpos = std::string::npos;
+                            for (size_t i = 1; i < match.size(); ++i) {
+                                if (match[i].length() > 0) {
+                                    mpos = match.position(i);
+                                    break;
+                                }
+                            }
+                            if (mpos == std::string::npos) {
+                                mpos = match.position(0);
+                            }
+                            pos = mpos;
+                        }
+                        break;
+                    }
+                    default:
+                        throw std::runtime_error("Unknown trigger type");
+                }
+                if (pos == std::string::npos) {
+                    continue;
+                }
+                if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
+                    earliest_trigger_pos = pos;
+                }
+            }
+            auto grammar_triggered = false;
+            if (earliest_trigger_pos != std::string::npos) {
+                constrained = constrained.substr(earliest_trigger_pos);
+                grammar_triggered = true;
+            }
+            if (data.params.grammar_lazy) {
+                assert_equals(expect_grammar_triggered, grammar_triggered);
+            }
+
+            if (grammar_triggered && test_grammar_if_triggered && !match_string(constrained, grammar.get())) {
+                throw std::runtime_error("Failed to match delta against grammar:\n\n" + data.delta +
+                    "\n\nConstrained: " + constrained +
+                    "\n\nGrammar: " + data.params.grammar);
+            }
+        }
+    }
+}
+
+/**
+ * Tests that streaming=true is consistent with streaming=false for a given partial parser.
+ * Also tests that partial messages do not cause problems.
+ */
+template <class T>
+static void test_parser_with_streaming(const common_chat_msg & expected, const std::string & raw_message, T parse_msg) {
+    constexpr auto utf8_truncate_safe_len = [](const std::string_view s) -> size_t {
+        auto len = s.size();
+        if (len == 0) return 0;
+        auto i = len;
+        // walk back over at most 4 bytes (the maximum UTF-8 sequence length)
+        for (size_t back = 0; back < 4 && i > 0; ++back) {
+            --i;
+            unsigned char c = s[i];
+            if ((c & 0x80) == 0) {
+                // ASCII byte: the string ends on a complete character
+                return len;
+            } else if ((c & 0xC0) == 0xC0) {
+                // lead byte: check whether the announced sequence is complete, e.g. a 2-byte
+                // sequence whose second byte was cut off gets truncated before the lead byte
+                size_t expected_len = 0;
+                if ((c & 0xE0) == 0xC0) expected_len = 2;
+                else if ((c & 0xF0) == 0xE0) expected_len = 3;
+                else if ((c & 0xF8) == 0xF0) expected_len = 4;
+                else return i;
+                if (len - i >= expected_len) {
+                    return len;
+                } else {
+                    return i;
+                }
+            }
+        }
+        return len - std::min(len, size_t(3));
+    };
+    constexpr auto utf8_truncate_safe_view = [utf8_truncate_safe_len](const std::string_view s) {
+        return s.substr(0, utf8_truncate_safe_len(s));
+    };
+
+    auto merged = simple_assist_msg("");
+    auto last_msg = parse_msg("");
+    for (size_t i = 1; i <= raw_message.size(); ++i) {
+        auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
+        if (curr_msg == simple_assist_msg("")) continue;
+        LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
+        for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
+            LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat<json>(diff).dump().c_str());
+            if (!diff.reasoning_content_delta.empty()) {
+                merged.reasoning_content += diff.reasoning_content_delta;
+            }
+            if (!diff.content_delta.empty()) {
+                merged.content += diff.content_delta;
+            }
+            if (diff.tool_call_index != std::string::npos) {
+                if (!diff.tool_call_delta.name.empty()) {
+                    merged.tool_calls.push_back({diff.tool_call_delta.name, "", ""});
+                }
+                if (!diff.tool_call_delta.arguments.empty()) {
+                    GGML_ASSERT(!merged.tool_calls.empty());
+                    merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
+                }
+            }
+            LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat<json>({merged}).dump().c_str());
+        }
+        assert_msg_equals(curr_msg, merged, true);
+        last_msg = curr_msg;
+    }
+    assert_msg_equals(expected, parse_msg(raw_message), true);
+    assert_msg_equals(expected, merged, true);
+}
+
+const common_chat_msg message_user {
+    "user",
+    "Hey there!",
+    /* .content_parts = */ {},
+    /* .tool_calls = */ {},
+    /* .reasoning_content = */ "",
+    /* .tool_name = */ "",
+    /* .tool_call_id = */ "",
+};
+
+const common_chat_msg message_user_parts {
+    "user",
+    /* .content = */ "",
+    /* .content_parts = */ {
+        { "text", "Hey" },
+        { "text", "there" },
+    },
+    /* .tool_calls = */ {},
+    /* .reasoning_content = */ "",
+    /* .tool_name = */ "",
+    /* .tool_call_id = */ "",
+};
+
+const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?");
+const common_chat_msg message_assist_empty = simple_assist_msg("");
simple_assist_msg(""); +const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts_unparsed_md = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```"); +const common_chat_msg message_assist_thoughts_unparsed_md_partial = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}"); + +const common_chat_msg message_assist_thoughts_unparsed_r7b = simple_assist_msg("<|START_THINKING|>I'm\nthinking<|END_THINKING|>Hello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts_unparsed_magistral = simple_assist_msg("[THINK]raisonnement[/THINK]Réponse"); +const common_chat_msg message_assist_thoughts = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"); +const common_chat_msg message_assist_thoughts_unopened_unparsed = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts_no_content = simple_assist_msg("", "I'm\nthinking"); +const common_chat_msg message_assist_call = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_noopt = simple_assist_msg("", "", "special_function_with_opt", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_withopt = simple_assist_msg("", "", "special_function_with_opt", "{\"arg1\": 1, \"arg2\": 2}"); +const common_chat_msg message_assist_call_content = simple_assist_msg("Hello, world!\nWhat's up?", "", "special_function", "{\"arg1\":1}"); +const common_chat_msg message_assist_call_empty_args = simple_assist_msg("", "", "special_function"); +const common_chat_msg message_assist_call_cutoff_args = simple_assist_msg("", "", "special_function", "{\"arg"); +const common_chat_msg message_assist_call_thoughts = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\":1}"); +const common_chat_msg message_assist_call_thoughts_unparsed = simple_assist_msg("I'm\nthinking\n\n", "", "special_function", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_thoughts_content = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_id = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "123456789"); +const common_chat_msg message_assist_call_idx = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "0"); +const common_chat_msg message_assist_thoughts_call_idx = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}", /* id = */ "0"); +const common_chat_msg message_assist_call_python = simple_assist_msg("", "", "python", "{\"code\":\"print('hey')\"}"); +const common_chat_msg message_assist_call_python_lines = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')\"}"); +const common_chat_msg message_assist_call_python_lines_unclosed = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')"); +const common_chat_msg message_assist_call_code_interpreter = simple_assist_msg("", "", "code_interpreter", "{\"code\":\"print('hey')\"}"); + +// Use for PEG parser implementations +struct peg_test_case { + common_chat_templates_inputs params; + std::string input; + common_chat_msg expect; +}; + +struct make_peg_parser { + common_chat_params params_; + common_peg_arena arena_; + + make_peg_parser(common_chat_templates * tmpls, const common_chat_templates_inputs & inputs) { + params_ 
+        arena_.load(params_.parser);
+    }
+
+    common_chat_msg parse(const std::string & msg, bool is_partial) {
+        common_chat_parser_params parser_params;
+        parser_params.format = params_.format;
+        return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
+    }
+};
+
+static void test_peg_parser(common_chat_templates * tmpls, const std::function<void(peg_test_case &)> & init) {
+    peg_test_case tc;
+    init(tc);
+    if (tc.params.messages.empty()) {
+        tc.params.messages = {message_user};
+    }
+    if (tc.expect.role.empty()) {
+        tc.expect.role = "assistant";
+    }
+
+    auto parser = make_peg_parser(tmpls, tc.params);
+
+    common_chat_msg msg_accum;
+    common_chat_msg msg_prev;
+    msg_accum.role = msg_prev.role = "assistant";
+
+    for (size_t i = 1; i <= tc.input.size(); ++i) {
+        auto is_partial = i < tc.input.size();
+        common_chat_msg msg_current = parser.parse(tc.input.substr(0, i), is_partial);
+
+        for (const auto & diff : common_chat_msg_diff::compute_diffs(msg_prev, msg_current)) {
+            if (!diff.reasoning_content_delta.empty()) {
+                msg_accum.reasoning_content += diff.reasoning_content_delta;
+            }
+            if (!diff.content_delta.empty()) {
+                msg_accum.content += diff.content_delta;
+            }
+            if (diff.tool_call_index != std::string::npos) {
+                if (!diff.tool_call_delta.name.empty()) {
+                    msg_accum.tool_calls.push_back({diff.tool_call_delta.name, "", diff.tool_call_delta.id});
+                }
+                if (!diff.tool_call_delta.arguments.empty()) {
+                    msg_accum.tool_calls.back().arguments += diff.tool_call_delta.arguments;
+                }
+            }
+        }
+        assert_msg_equals(msg_current, msg_accum, true);
+        msg_prev = msg_current;
+    }
+
+    assert_msg_equals(tc.expect, parser.parse(tc.input, false), true);
+    assert_msg_equals(tc.expect, msg_accum, true);
+}
+
+static void test_msgs_oaicompat_json_conversion() {
+    printf("[%s]\n", __func__);
+    std::vector<common_chat_msg> msgs{
+        message_user,
+        message_user_parts,
+        message_assist_call,
+        message_assist_call_thoughts,
+        message_assist_call_thoughts_unparsed,
+        message_assist_call_thoughts_content,
+        message_assist_call_id,
+        message_assist_call_idx,
+        message_assist_call_python,
+        message_assist_call_code_interpreter,
+    };
+    for (const auto & msg : msgs) {
+        auto oai_json = common_chat_msgs_to_json_oaicompat<json>({msg});
+        auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json);
+        assert_equals((size_t) 1, msgs2.size());
+        auto msg2 = msgs2[0];
+        assert_msg_equals(msg, msg2);
+    }
+    assert_equals(
+        std::string(
+            "[\n"
+            "  {\n"
+            "    \"role\": \"user\",\n"
+            "    \"content\": [\n"
+            "      {\n"
+            "        \"type\": \"text\",\n"
+            "        \"text\": \"Hey\"\n"
+            "      },\n"
+            "      {\n"
+            "        \"type\": \"text\",\n"
+            "        \"text\": \"there\"\n"
+            "      }\n"
+            "    ]\n"
+            "  }\n"
+            "]"
+        ),
+        common_chat_msgs_to_json_oaicompat<json>({message_user_parts}).dump(2));
+
+    assert_equals(
+        std::string(
+            "[\n"
+            "  {\n"
+            "    \"role\": \"assistant\",\n"
+            "    \"content\": \"\",\n"
+            "    \"tool_calls\": [\n"
+            "      {\n"
+            "        \"type\": \"function\",\n"
+            "        \"function\": {\n"
+            "          \"name\": \"python\",\n"
+            "          \"arguments\": \"{\\\"code\\\":\\\"print('hey')\\\"}\"\n"
+            "        }\n"
+            "      }\n"
+            "    ]\n"
+            "  }\n"
+            "]"
+        ),
+        common_chat_msgs_to_json_oaicompat<json>({message_assist_call_python}).dump(2));
+
+    auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]"));
+    assert_equals<size_t>(1, res.size());
+    assert_equals<std::string>(res[0].role, "assistant");
+    assert_equals(true, res[0].content.empty());
+    assert_equals(true, res[0].tool_calls.empty());
+
+    try {
+        common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\"}]"));
+        throw std::runtime_error("Expected exception");
std::runtime_error("Expected exception"); + } catch (const std::exception & e) { + if (std::string(e.what()).find("'content'") == std::string::npos) { + throw std::runtime_error("Expected exception about missing 'content'"); + } + } +} + +static void test_tools_oaicompat_json_conversion() { + printf("[%s]\n", __func__); + std::vector tools{ + special_function_tool, + python_tool, + code_interpreter_tool, + }; + + for (const auto & tool : tools) { + auto oai_json = common_chat_tools_to_json_oaicompat({tool}); + auto tools2 = common_chat_tools_parse_oaicompat(oai_json); + assert_equals((size_t) 1, tools2.size()); + auto tool2 = tools2[0]; + assert_equals(tool.name, tool2.name); + assert_equals(tool.description, tool2.description); + assert_equals(json::parse(tool.parameters).dump(2), json::parse(tool2.parameters).dump(2)); + } + + assert_equals( + std::string( + "[\n" + " {\n" + " \"type\": \"function\",\n" + " \"function\": {\n" + " \"name\": \"special_function\",\n" + " \"description\": \"I'm special\",\n" + " \"parameters\": {\n" + " \"type\": \"object\",\n" + " \"properties\": {\n" + " \"arg1\": {\n" + " \"type\": \"integer\",\n" + " \"description\": \"The arg.\"\n" + " }\n" + " },\n" + " \"required\": [\n" + " \"arg1\"\n" + " ]\n" + " }\n" + " }\n" + " }\n" + "]" + ), + common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2)); + + { + auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse( + R"([{"type": "function", "function": {"name": "test_func", "description": "A test"}}])")); + assert_equals((size_t) 1, tools_no_params.size()); + assert_equals(std::string("test_func"), tools_no_params[0].name); + assert_equals(std::string("A test"), tools_no_params[0].description); + assert_equals(std::string("{}"), tools_no_params[0].parameters); + } + { + auto tools_no_desc = common_chat_tools_parse_oaicompat(json::parse( + R"([{"type": "function", "function": {"name": "test_func", "parameters": {"type": "object"}}}])")); + assert_equals((size_t) 1, tools_no_desc.size()); + assert_equals(std::string("test_func"), tools_no_desc[0].name); + assert_equals(std::string(""), tools_no_desc[0].description); + } + { + auto tools_minimal = common_chat_tools_parse_oaicompat(json::parse( + R"([{"type": "function", "function": {"name": "test_func"}}])")); + assert_equals((size_t) 1, tools_minimal.size()); + assert_equals(std::string("test_func"), tools_minimal[0].name); + assert_equals(std::string(""), tools_minimal[0].description); + assert_equals(std::string("{}"), tools_minimal[0].parameters); + } +} + +// for compat; ref: https://github.com/ggml-org/llama.cpp/pull/18961 +struct test_parser_params { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool reasoning_in_content = false; + bool thinking_forced_open = false; + bool parse_tool_calls = true; +}; + +static common_chat_msg test_chat_parse(const std::string & input, bool is_partial, const test_parser_params & syntax) { + common_chat_parser_params params; + params.format = syntax.format; + params.reasoning_format = syntax.reasoning_format; + params.reasoning_in_content = syntax.reasoning_in_content; + params.thinking_forced_open = syntax.thinking_forced_open; + params.parse_tool_calls = syntax.parse_tool_calls; + return common_chat_parse(input, is_partial, params); +} + +static void test_template_output_parsers() { + printf("[%s]\n", __func__); + + common_chat_templates_inputs inputs_no_tools; + inputs_no_tools.messages = 
{message_user}; + + common_chat_templates_inputs inputs_tools; + inputs_tools.messages = {message_user}; + inputs_tools.tools = {special_function_tool}; + + common_chat_templates_inputs inputs_tools_builtin; + inputs_tools_builtin.messages = {message_user}; + inputs_tools_builtin.tools = {python_tool}; + + { + // Not supported yet + auto tmpls = read_templates("models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja"); + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_GENERIC, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + } + { + auto tmpls = read_templates("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"); + std::vector end_tokens{ "<|END_OF_TURN_TOKEN|>" }; + + for (const auto & inputs : { inputs_no_tools, inputs_tools }) { + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, params.format); + assert_equals(false, params.thinking_forced_open); + } + + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_COMMAND_R7B})); + assert_msg_equals(message_assist, + test_chat_parse( + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_COMMAND_R7B})); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + })); + assert_msg_equals(message_assist_thoughts_unparsed_r7b, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_COMMAND_R7B})); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_call_idx, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_no_content, + test_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + 
test_templates(tmpls.get(), end_tokens, message_assist_call_idx, tools, + "<|START_THINKING|><|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + COMMON_REASONING_FORMAT_DEEPSEEK); + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "<|START_RESPONSE|>Hello, world!\n" + "What's up?<|END_RESPONSE|>", + /* expect_grammar_triggered= */ false); + } + // TODO @ngxson : generic tool calls is too costly to maintain, consider removing it in the future + { + auto tmpls = read_templates("models/templates/google-gemma-2-2b-it.jinja"); + std::vector end_tokens{ "" }; + + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_GENERIC, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_GENERIC, + common_chat_templates_apply( + read_templates("models/templates/microsoft-Phi-3.5-mini-instruct.jinja").get(), + inputs_tools) + .format); + + // Generic tool calls doesn't generate / parse content-only messages symmetrically. + + assert_equals( + simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"), + test_chat_parse( + "{ \"tool_call\" : { \"name\" : \"t", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GENERIC, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ false, + })); + assert_equals( + message_assist_empty, + test_chat_parse( + "{ \"tool_call\" : { \"name\" : \"t", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + + assert_equals( + simple_assist_msg("", "", "puppeteer_screenshot", "{\"name\":\"servethehome_homepage\","), + test_chat_parse( + R"({"tool_call": {"name": "puppeteer_screenshot", "arguments": {"name": "servethehome_homepage",)", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + + assert_equals( + message_assist_call_empty_args, + test_chat_parse( + "{ \"tool_call\" : { \"name\" : \"special_function\"", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + assert_equals( + message_assist_call_cutoff_args, + test_chat_parse( + "{ \"tool_call\" : { \"name\" : \"special_function\", \"arguments\" : { \"arg", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + + assert_msg_equals(message_assist, + test_chat_parse( + "{\n" + " \"response\": \"Hello, world!\\nWhat's up?\"\n" + "}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GENERIC})); +#if 0 + test_templates(tmpls.get(), end_tokens, message_assist_call_id, tools, + "{\n" + " \"tool_calls\": [\n" + " {\n" + " \"name\": \"special_function\",\n" + " \"arguments\": {\n" + " \"arg1\": 1\n" + " },\n" + " \"id\": \"123456789\"\n" + " }\n" + " ],\n" + " \"content\": \"\"\n" + "}"); +#endif + } + { + auto tmpls = read_templates("models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja"); + std::vector end_tokens{ "" }; + + assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates( + tmpls.get(), end_tokens, message_assist_call_id, tools, + "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": 
\"123456789\"}]"); + } + { + assert_msg_equals( + simple_assist_msg("Réponse", "raisonnement"), + test_chat_parse( + message_assist_thoughts_unparsed_magistral.content, + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_MAGISTRAL, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + } + { + auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja"); + std::vector end_tokens{ "<|im_end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + } + { + auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"); + std::vector end_tokens{ "<|im_end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals( + COMMON_CHAT_FORMAT_HERMES_2_PRO, + common_chat_templates_apply( + read_templates("models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja").get(), + inputs_tools) + .format); + assert_equals( + COMMON_CHAT_FORMAT_HERMES_2_PRO, + common_chat_templates_apply( + read_templates("models/templates/Qwen-Qwen2.5-7B-Instruct.jinja").get(), + inputs_tools) + .format); + + // Test parsing + assert_msg_equals( + simple_assist_msg("", "", "python", ""), + test_chat_parse( + "```json\n" + " { \"name\" : \"python\"", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + simple_assist_msg("Let's call something\n"), + test_chat_parse( + "Let's call something\n" + "{\"name\"", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals( + simple_assist_msg("Let's call something\n"), + test_chat_parse( + "Let's call something\n" + "{\"name", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + // QwQ-32B's template adds a trailing if add_generation_prompt + "I'm\nthinking\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "Hello, world!\nWhat's up?\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "{\"arg1\": 1}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + "{\"arg1\": 1}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 
1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```xml\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```xml\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```json\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "```json\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n" + " \n" + "``` ", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\n" + " \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n" + " }\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + test_chat_parse( + "{\n \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + + // Test multiple tool calls + common_chat_msg message_assist_multiple_calls; + message_assist_multiple_calls.role = "assistant"; + message_assist_multiple_calls.content = ""; + message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""}); + message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""}); + + assert_msg_equals( + message_assist_multiple_calls, + test_chat_parse( + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "\n" + "\n" + "{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n" + "", + /* is_partial= */ 
false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + + assert_msg_equals( + message_assist_multiple_calls, + test_chat_parse( + "{\"arg1\": 1}\n" + "{\"code\":\"print('hello')\"}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + + assert_msg_equals( + simple_assist_msg( + "This is not a tool call:", + "", + "special_function", + "{\"arg1\": 1}"), + test_chat_parse( + "This is not a tool call:\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + // assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + // test_chat_parse( + // "I'm\nthinkingHello, world!\nWhat's up?", + // COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unparsed_md, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + })); + assert_msg_equals(message_assist_thoughts_unparsed_md_partial, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + })); + assert_msg_equals(message_assist_thoughts_unopened_unparsed, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + ""); + + // Test multiple tool calls with template + common_chat_msg message_assist_multiple_calls_template; + message_assist_multiple_calls_template.role = "assistant"; + message_assist_multiple_calls_template.content = ""; + 
message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""}); + message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""}); + + test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools, + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "\n" + "\n" + "{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n" + ""); + + test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools, + "\n" + "{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n" + ""); + assert_msg_equals( + simple_assist_msg("", /* reasoning_content= */ "nah uhg"), + test_chat_parse( + "nah uhg", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + } + { + auto tmpls = read_templates("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja"); + std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; + + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, + common_chat_templates_apply(tmpls.get(), inputs_tools_builtin).format); + assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, + common_chat_templates_apply( + read_templates("models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja").get(), + inputs_tools_builtin) + .format); + + assert_equals( + message_assist_call, + test_chat_parse( + "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LLAMA_3_X})); + + // test_templates(tmpls.get(), end_tokens, message_assist, tools, R"(?)", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call_code_interpreter, llama_3_1_tools, + "<|python_tag|>code_interpreter.call(code=\"print('hey')\")"); + test_templates(tmpls.get(), end_tokens, message_assist_call_python, tools, + "<|python_tag|>python.call(code=\"print('hey')\")"); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); + } + { + auto tmpls = read_templates("models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja"); + std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; + + assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); + } + { + auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.1.jinja"); + std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; + + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, + common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, + common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, + 
common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + + for (auto is_partial : { false, true }) { + assert_equals( + message_assist_call, + test_chat_parse( + "{\"arg1\": 1}", + is_partial, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); + } + + assert_equals( + message_assist_call, + test_chat_parse( + "{\"arg1\": 1}<", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "{\"arg1\": 1}"); + } + { + auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.2.jinja"); + std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; + + assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + assert_msg_equals( + simple_assist_msg( + "Hello, world!\nnono\nWhat's up?", + "", + "special_function", + "{\"arg1\": 1}"), + test_chat_parse( + "all\n" + "Hello, world!\n" + "nono\n" + "What's up?>>>special_function\n" + "{\"arg1\": 1}\n", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call_python_lines, + test_chat_parse( + "python\n" + "# This is a program:\n" + "print('hey')", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call_python_lines_unclosed, + test_chat_parse( + "python\n" + "# This is a program:\n" + "print('hey')", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call, + test_chat_parse( + "special_function\n" + "{\"arg1\": 1} \n ", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist, + test_chat_parse( + "all\n" + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + + test_templates(tmpls.get(), end_tokens, message_assist, {}, + "all\n" + "Hello, world!\n" + "What's up?", + /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "special_function\n" + "{\"arg1\": 1}"); + } + { + auto tmpls = read_templates("models/templates/fireworks-ai-llama-3-firefunction-v2.jinja"); + std::vector end_tokens{ "<|eot_id|>" }; + + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + " functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]"); + } + { + // Original DeepSeek R1 template. Leaves <|tool▁calls▁begin|> and others unclosed. Our logic fixes the prompt. 
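+        // The template ends its prompt with an opening think tag, so
+        // common_chat_templates_apply() should report thinking_forced_open for both
+        // tool and no-tool inputs; the loop below asserts this, and the parse tests
+        // that follow feed model output that starts mid-reasoning.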
+ auto tmpls = read_templates("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja"); + std::vector end_tokens{ "<|end▁of▁sentence|>" }; + + for (const auto & inputs : { inputs_no_tools, inputs_tools }) { + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, params.format); + assert_equals(true, params.thinking_forced_open); + } + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + assert_msg_equals( + simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + assert_msg_equals( + simple_assist_msg("", "I need to remember the correct syntax. It starts with <|tool▁calls▁begin|> and ends with"), + test_chat_parse( + "I need to remember the correct syntax. It starts with <|tool▁calls▁begin|> and ends with", + /* is_partial= */ true, + { + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unopened_unparsed, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + assert_msg_equals(message_assist_thoughts, + // Latest template update (ast of 20250209) adds a trailing \n if add_generation_prompt is true. + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + // test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + // "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" + // "```json\n" + // "{\"arg1\": 1}\n" + // // Look what's not here: <|tool▁calls▁end|> (also missing the <|end▁of▁sentence|>, but that is removed lazily by the test's delta logic) + // "```<|tool▁call▁end|>", + // /* expect_grammar_triggered= */ true, + // /* test_grammar_if_triggered= */ false); + } + { + // Replacement DeepSeek R1 template. Makes the Distill Qwen 7B/32B models happy to call tools and all. 
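+        // Unlike the original template above, this one is not expected to force the
+        // thinking block open: the two format asserts below check only the format,
+        // and the parse tests cover unparsed reasoning (no reasoning_format set),
+        // parsed think blocks, and tool-call sections with and without reasoning.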
+ auto tmpls = read_templates("models/templates/llama-cpp-deepseek-r1.jinja"); + std::vector end_tokens{ "<|end▁of▁sentence|>" }; + + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + + assert_msg_equals(message_assist_call_thoughts_unparsed, + test_chat_parse( + "I'm\nthinking\n\n" + "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" + "```json\n" + "{\"arg1\": 1}\n" + "```<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); + assert_msg_equals(message_assist_call, + test_chat_parse( + "<|tool▁calls|>function<|tool▁sep|>special_function\n" + "```json\n" + "{\"arg1\": 1}\n" + "```<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); + + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "I'm\nthinking\n\n" + "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" + "```json\n" + "{\"arg1\": 1}\n" + "```<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" + "```json\n" + "{\"arg1\": 1}\n" + "```<|tool▁call▁end|><|tool▁calls▁end|>"); + } + { + auto tmpls = read_templates("models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja"); + std::vector end_tokens{ "<|end_of_text|>" }; + + assert_equals(COMMON_CHAT_FORMAT_GRANITE, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + + assert_equals(COMMON_CHAT_FORMAT_GRANITE, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals( + message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GRANITE})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ 
COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"), + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals(message_assist_empty, + test_chat_parse( + "I'm\nthinking", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals( + message_assist_empty, + test_chat_parse( + "I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals( + message_assist_call_empty_args, + test_chat_parse( + "<|tool_call|>[{\"name\": \"special_function\"", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals( + message_assist_call_cutoff_args, + test_chat_parse( + "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GRANITE})); + assert_msg_equals( + message_assist_call_cutoff_args, + test_chat_parse( + "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test parsing tool calls with thinking + assert_msg_equals( + message_assist_call_thoughts, + test_chat_parse( + "I'm\nthinking<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, {", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GRANITE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "Hello, world!\nWhat's up?", + /* expect_grammar_triggered= */ false); + // TODO @ngxson : generic tool call should be removed in the future +#if 0 + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call_id, tools, + "{\n" + " \"tool_calls\": [\n" + " {\n" + " \"name\": \"special_function\",\n" + " \"arguments\": {\n" + " \"arg1\": 1\n" + " },\n" + " \"id\": \"123456789\"\n" + " }\n" + " ],\n" + " \"content\": \"\"\n" + "}", + /* expect_grammar_triggered= */ false + ); +#endif + } + { + auto tmpls = read_templates("models/templates/openai-gpt-oss-120b.jinja"); + std::vector end_tokens{ "<|return|>", "<|call|>" }; + + assert_equals(COMMON_CHAT_FORMAT_GPT_OSS, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_GPT_OSS, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + assert_msg_equals(simple_assist_msg("", "I'm\nthink"), + 
test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthink", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>analysis to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + + // Test parse_tool_calls == false + assert_msg_equals( + simple_assist_msg("Hello, world!\nWhat's 
up?", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + })); + assert_msg_equals( + simple_assist_msg("", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + })); + assert_msg_equals( + simple_assist_msg("", "I'm\nthinking"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + })); + + // Test reasoning formats + assert_msg_equals( + simple_assist_msg( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + })); + + assert_msg_equals( + simple_assist_msg( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .reasoning_in_content = */ true, + })); + + // Test tool calling in role header + assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + " to=functions.special_function<|channel|>commentary <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + " to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), + test_chat_parse( + "<|channel|>analysis<|message|>I'm\nthinking<|end|>" + "<|start|>assistant to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GPT_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + })); + } + { + // Seed-OSS format tests + auto tmpls = 
read_templates("models/templates/ByteDance-Seed-OSS.jinja"); + std::vector end_tokens{ "" }; + + assert_equals(COMMON_CHAT_FORMAT_SEED_OSS, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_SEED_OSS, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + + // Test simple reasoning content + assert_msg_equals( + simple_assist_msg("Hello, world!", "I'm thinking about the answer"), + test_chat_parse( + "I'm thinking about the answerHello, world!", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_SEED_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test budget reflection tags + common_chat_msg msg_budget_reflect; + msg_budget_reflect.role = "assistant"; + msg_budget_reflect.content = "Token usage: 45/1000\nI should continue thinking to find the best solution.I need to calculate this step by step."; + msg_budget_reflect.reasoning_content = "Token usage: 45/1000\nI should continue thinking to find the best solution."; + assert_msg_equals( + msg_budget_reflect, + test_chat_parse( + "Token usage: 45/1000\nI should continue thinking to find the best solution." + "Token usage: 45/1000\nI should continue thinking to find the best solution." + "I need to calculate this step by step.", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_SEED_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test tool calls with Seed-OSS format + common_chat_msg msg_tool_call; + msg_tool_call.role = "assistant"; + msg_tool_call.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""}); + assert_msg_equals( + msg_tool_call, + test_chat_parse( + "\n" + "\n" + "[1, 2, 3]\n" + "\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_SEED_OSS})); + + // Test reasoning + tool call combination + common_chat_msg msg_reasoning_tool; + msg_reasoning_tool.role = "assistant"; + msg_reasoning_tool.content = ""; + msg_reasoning_tool.reasoning_content = "I need to calculate the sum of these numbers"; + msg_reasoning_tool.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""}); + assert_msg_equals( + msg_reasoning_tool, + test_chat_parse( + "I need to calculate the sum of these numbers" + "\n" + "\n" + "[1, 2, 3]\n" + "\n" + "", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_SEED_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test deltas: the number of tool calls in partial parses should never decrease + std::string tool_msg = "\n" + "\n" + "[1, 2, 3]\n" + ""; + std::size_t previousToolCalls = 0; + for (std::size_t i = std::string("").length(); i < tool_msg.length() - 1; i++) { + auto partial = tool_msg.substr(0, i); + auto partial_res = test_chat_parse(partial, true, { COMMON_CHAT_FORMAT_SEED_OSS, COMMON_REASONING_FORMAT_DEEPSEEK }); + if (partial_res.tool_calls.size() < previousToolCalls) { + throw std::runtime_error("Tool call size decreased on partial: " + partial + " from " + std::to_string(previousToolCalls) + " to " + std::to_string(partial_res.tool_calls.size())); + } + previousToolCalls = partial_res.tool_calls.size(); + } + + // Test multiple parameters in tool call + common_chat_msg msg_multi_param; + msg_multi_param.role = "assistant"; + msg_multi_param.tool_calls.push_back({"process_data", "{\"input\": \"test\", \"format\": \"json\"}", ""}); + 
assert_msg_equals( + msg_multi_param, + test_chat_parse( + "\n" + "\n" + "test\n" + "json\n" + "\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_SEED_OSS})); + + // Test partial parsing for incomplete tool call - don't actually add the call until parsing parameters is done + assert_msg_equals( + simple_assist_msg("", "", "calculate_sum", "{\"numbers\":"), + test_chat_parse( + "\n" + "\n" + "[1,\n", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_SEED_OSS})); + + // Test incomplete reasoning tag + assert_msg_equals( + simple_assist_msg("", "I was thinking"), + test_chat_parse( + "I was thinking", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_SEED_OSS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test content without reasoning + assert_msg_equals( + simple_assist_msg("This is a simple response without reasoning."), + test_chat_parse( + "This is a simple response without reasoning.", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_SEED_OSS})); + } + { + auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-Nano-v2.jinja"); + std::vector end_tokens{ "" }; + + assert_equals(COMMON_CHAT_FORMAT_NEMOTRON_V2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_NEMOTRON_V2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_NEMOTRON_V2})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test parsing tool calls + assert_msg_equals(message_assist_call, + test_chat_parse( + "[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_NEMOTRON_V2})); + + // Test parsing tool calls with thinking + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test tool calls with extra content + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_NEMOTRON_V2} + )); + + // Test tool calls with extra content AND thinking + assert_msg_equals(message_assist_call_thoughts_content, + test_chat_parse( + "I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_NEMOTRON_V2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "Hello, world!\nWhat's up?\n", + /* expect_grammar_triggered= */ false); + + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", + /* expect_grammar_triggered= */ true + ); + } + { + auto tmpls = 
read_templates("models/templates/deepseek-ai-DeepSeek-V3.1.jinja"); + std::vector end_tokens{ "<|end▁of▁sentence|>" }; + + for (const auto & inputs : { inputs_no_tools, inputs_tools }) { + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, params.format); + assert_equals(true, params.thinking_forced_open); + } + + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + assert_msg_equals( + simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + // variant: thinking forced open, reasoning_format none + assert_msg_equals( + simple_assist_msg("REASONINGok", ""), + test_chat_parse( + "REASONINGok", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ true, + })); + // variant: happy path for when it works as the model card says it should + assert_msg_equals( + simple_assist_msg("", "", "get_time", "{\"city\":\"Tokyo\"}"), + test_chat_parse( + "<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, + })); + // variant: simple + thinking open + assert_msg_equals( + simple_assist_msg("", "REASONING", "get_time", "{\"city\":\"Tokyo\"}"), + test_chat_parse( + "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ true, + })); + // variant: simple + multiple tool calls + common_chat_msg message_assist_multiple_calls; + message_assist_multiple_calls.role = "assistant"; + message_assist_multiple_calls.content = "CONTENT"; + message_assist_multiple_calls.tool_calls.push_back({"get_time", "{\"city\":\"Paris\"}", ""}); + message_assist_multiple_calls.tool_calls.push_back({"get_weather", "{\"city\":\"Paris\"}", ""}); + assert_msg_equals( + message_assist_multiple_calls, + test_chat_parse( + "CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, + })); + // variant: thinking forced open + tool call in reasoning content + assert_msg_equals( + simple_assist_msg("", 
"REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING", "get_time", "{\"city\":\"Tokyo\"}"), + test_chat_parse( + "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ true, + })); + // variant: thinking forced open + tool call in reasoning content + no closing think + not partial + // This is a bit of a fine tuning issue on the model's part IMO. It really should not be attempting + // to make tool calls in reasoning content according to the model card, but it does sometimes, so + // add the reasoning content as regular content and parse the tool calls. + assert_msg_equals( + simple_assist_msg("REASONING", "", "get_time", "{\"city\":\"Tokyo\"}"), + test_chat_parse( + "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ true, + })); + // variant: thinking forced open + tool call in reasoning content + no closing think + partial + assert_msg_equals( + simple_assist_msg("", "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", "", ""), + test_chat_parse( + "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ true, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ true, + })); + // variant: thinking not forced open + missing reasoning + no tool calls + assert_msg_equals( + simple_assist_msg("CONTENT", ""), + test_chat_parse( + "CONTENT", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ true, + })); + } + { + auto tmpls = read_templates("models/templates/Apertus-8B-Instruct.jinja"); + std::vector end_tokens{ "<|assistant_end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_APERTUS, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_APERTUS, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_APERTUS})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "<|inner_prefix|>I'm\nthinking<|inner_suffix|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_APERTUS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test parsing tool calls + 
assert_msg_equals(message_assist_call, + test_chat_parse( + "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_APERTUS})); + + // Test parsing tool calls with thinking + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_APERTUS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test tool calls with extra content + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_APERTUS} + )); + + // Test tool calls with extra content AND thinking + assert_msg_equals(message_assist_call_thoughts_content, + test_chat_parse( + "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_APERTUS, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "Hello, world!\nWhat's up?", + /* expect_grammar_triggered= */ false); + + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>", + /* expect_grammar_triggered= */ true + ); + + // TODO @ngxson : not sure why this fails, but not very important for now + // assert_equals(true, common_chat_templates_support_enable_thinking(tmpls.get())); + } + { + // LFM2 format tests + auto tmpls = read_templates("models/templates/llama-cpp-lfm2.jinja"); + std::vector end_tokens{ "<|im_end|>" }; + + auto inputs_tools_forced_json_schema = std::invoke([&]() -> common_chat_templates_inputs { + common_chat_templates_inputs inputs; + inputs.messages = { + std::invoke([&]() -> common_chat_msg { + common_chat_msg msg; + msg.role = "system"; + msg.content = "force json schema.\n"; + return msg; + }), + message_user, + }; + inputs.tools = {special_function_tool}; + return inputs; + }); + + { + auto params = common_chat_templates_apply(tmpls.get(), inputs_no_tools); + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, params.format); + assert_equals(false, params.grammar_lazy); + assert_equals(std::string(R"(<|im_start|>user +Hey there!<|im_end|> +<|im_start|>assistant +)"), params.prompt); + } + + { + auto params = common_chat_templates_apply(tmpls.get(), inputs_tools); + assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, params.format); + assert_equals(false, params.grammar_lazy); + assert_equals(std::string(R"(<|im_start|>system +List of tools: <|tool_list_start|>[{"type": "function", "function": {"name": "special_function", "description": "I'm special", "parameters": {"type": "object", "properties": {"arg1": {"type": "integer", "description": "The arg."}}, "required": ["arg1"]}}}]<|tool_list_end|><|im_end|> +<|im_start|>user +Hey there!<|im_end|> +<|im_start|>assistant +)"), params.prompt); + assert_equals(true, params.grammar.empty()); + } + + { + auto params = common_chat_templates_apply(tmpls.get(), inputs_tools_forced_json_schema); + assert_equals(COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS, params.format); + 
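+        // With the JSON-schema system prompt the format switches from CONTENT_ONLY to
+        // LFM2_WITH_JSON_TOOLS; the next two asserts pin down the grammar side of that
+        // switch: grammar_lazy is now set and a non-empty grammar is emitted, whereas
+        // the plain tool case above produced no grammar at all.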
assert_equals(true, params.grammar_lazy); + assert_equals(std::string(R"(<|im_start|>system +List of tools: <|tool_list_start|>[{"type": "function", "function": {"name": "special_function", "description": "I'm special", "parameters": {"type": "object", "properties": {"arg1": {"type": "integer", "description": "The arg."}}, "required": ["arg1"]}}}]<|tool_list_end|><|im_end|> +<|im_start|>user +Hey there!<|im_end|> +<|im_start|>assistant +)"), params.prompt); + assert_equals(false, params.grammar.empty()); + } + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test single tool call with JSON format + common_chat_msg msg_single_tool_call; + msg_single_tool_call.role = "assistant"; + msg_single_tool_call.tool_calls.push_back({"special_function", "{\"arg1\":1}", ""}); + assert_msg_equals( + msg_single_tool_call, + test_chat_parse( + "<|tool_call_start|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test tool call with string argument + common_chat_msg msg_tool_call_string; + msg_tool_call_string.role = "assistant"; + msg_tool_call_string.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); + assert_msg_equals( + msg_tool_call_string, + test_chat_parse( + "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test tool call with multiple arguments + common_chat_msg msg_multi_args; + msg_multi_args.role = "assistant"; + msg_multi_args.tool_calls.push_back({"calculate", "{\"x\":10,\"y\":20,\"operation\":\"add\"}", ""}); + assert_msg_equals( + msg_multi_args, + test_chat_parse( + "<|tool_call_start|>[{\"name\": \"calculate\", \"arguments\": {\"x\": 10, \"y\": 20, \"operation\": \"add\"}}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test multiple tool calls in single array + common_chat_msg msg_multiple_tools; + msg_multiple_tools.role = "assistant"; + msg_multiple_tools.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); + msg_multiple_tools.tool_calls.push_back({"get_time", "{\"timezone\":\"UTC\"}", ""}); + assert_msg_equals( + msg_multiple_tools, + test_chat_parse( + "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}, {\"name\": \"get_time\", \"arguments\": {\"timezone\": \"UTC\"}}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test tool call with content before + common_chat_msg msg_content_before_tool; + msg_content_before_tool.role = "assistant"; + msg_content_before_tool.content = "Let me check the weather for you."; + msg_content_before_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); + assert_msg_equals( + msg_content_before_tool, + test_chat_parse( + "Let me check the weather for you.<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test tool call with content after + common_chat_msg msg_content_after_tool; + msg_content_after_tool.role = "assistant"; + msg_content_after_tool.content = "Here's the result."; + 
msg_content_after_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); + assert_msg_equals( + msg_content_after_tool, + test_chat_parse( + "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>Here's the result.", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Test tool call with newlines (common in LLM output) + common_chat_msg msg_tool_call_newlines; + msg_tool_call_newlines.role = "assistant"; + msg_tool_call_newlines.tool_calls.push_back({"get_current_time", "{\"location\":\"Paris\"}", ""}); + assert_msg_equals( + msg_tool_call_newlines, + test_chat_parse( + "<|tool_call_start|>[{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"location\": \"Paris\"\n }\n}]<|tool_call_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); + + // Note: LFM2 uses JSON format for tool calls: [{"name": "...", "arguments": {...}}] + // Unlike other formats, LFM2 template does not render tool calls in conversation history, + // so we don't use test_templates() for tool call generation. Instead, the parsing tests + // above verify edge cases and format variations for the tool call output format. + } + + { + auto tmpls = read_templates("models/templates/MiniMax-M2.jinja"); + std::vector end_tokens{ "[e~[" }; + + assert_equals(COMMON_CHAT_FORMAT_MINIMAX_M2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_MINIMAX_M2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_MINIMAX_M2})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test parsing tool calls + assert_msg_equals(message_assist_call, + test_chat_parse( + "1", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_MINIMAX_M2})); + + // Test parsing tool calls with thinking + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "I'm\nthinking1", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test tool calls with extra content + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "1Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_MINIMAX_M2} + )); + + // Test tool calls with extra content AND thinking + assert_msg_equals(message_assist_call_thoughts_content, + test_chat_parse( + "I'm\nthinking1Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test streaming + test_parser_with_streaming(message_assist_call_thoughts_content, + "I'm\nthinking\nHello, world!\nWhat's up?\n1", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(message_assist_call_thoughts_unparsed, + "I'm\nthinking\n\n1", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= 
*/ true, { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + test_parser_with_streaming(message_assist_call_thoughts_content, + "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n\n\n1\n\n\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(message_assist_call_withopt, + "\n\n1\n2\n\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "Hello, world!\nWhat's up?", + /* expect_grammar_triggered= */ false); + + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "\n\n1\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_NONE, + /* ignore_whitespace_differences= */ true + ); + + // Test template generation for tools with optional parameters + test_templates(tmpls.get(), end_tokens, message_assist_call_noopt, tools, + "\n\n1\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_NONE, + /* ignore_whitespace_differences= */ true + ); + test_templates(tmpls.get(), end_tokens, message_assist_call_withopt, tools, + "\n\n1\n2\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_NONE, + /* ignore_whitespace_differences= */ true + ); + } + + { + auto tmpls = read_templates("models/templates/GLM-4.6.jinja"); + std::vector end_tokens{ "<|assistant|>", "<|observation|>" }; + + assert_equals(COMMON_CHAT_FORMAT_GLM_4_5, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_GLM_4_5, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GLM_4_5})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "\nI'm\nthinking\nHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + }), true); + + // Test parsing tool calls + assert_msg_equals(message_assist_call, + test_chat_parse( + "\nspecial_function\narg1\n1\n", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GLM_4_5}), true); + + // Test parsing tool calls with thinking + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "\nI'm\nthinking\nspecial_function\narg1\n1\n", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }), true); + + // Test tool calls with extra content + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "\nspecial_function\narg1\n1\nHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GLM_4_5} + ), true); + + // Test tool calls with extra content AND thinking + 
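+        // As in the MiniMax case above, reasoning, a tool call and trailing content
+        // can all appear in a single message; the expected
+        // message_assist_call_thoughts_content keeps all three fields populated.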
assert_msg_equals(message_assist_call_thoughts_content, + test_chat_parse( + "\nI'm\nthinkingHello, world!\nWhat's up?\nspecial_function\narg1\n1\n", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }), true); + + // Test streaming + test_parser_with_streaming(message_assist_call_thoughts_content, + "\nI'm\nthinkingHello, world!\nWhat's up?\nspecial_function\narg1\n1\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(message_assist_call_thoughts_unparsed, + "\nI'm\nthinking\n\nspecial_function\narg1\n1\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + test_parser_with_streaming(message_assist_call_withopt, + "\n\nspecial_function_with_opt\narg1\n1\narg2\n2\n\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("", "", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "complex_function\n" + "name\n" + "John Doe\n" + "age\n" + "30\n" + "active\n" + "true\n" + "score\n" + "95.5\n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); }); + test_parser_with_streaming( + simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"), + "web_search\n" + "query\n" + "\"From Zero\" Linkin Park album tracklist complete songs\n" + "limit\n" + "3\n" + "type\n" + "text\n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); }); + + // Test interleaved thinking + test_parser_with_streaming(simple_assist_msg("Hello, world!\n\nWhat's up?", "I'm\nthinkingThinking2", "special_function", "{\"arg1\": 1}"), + "\nI'm\nthinkingHello, world!\nThinking2What's up?\nspecial_function\narg1\n1\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(simple_assist_msg("\nI'm\nthinkingHello, world!\nThinking2What's up?", "", "special_function", "{\"arg1\": 1}"), + "\nI'm\nthinkingHello, world!\nThinking2What's up?\nspecial_function\narg1\n1\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "\n\nHello, world!\nWhat's up?", + /* expect_grammar_triggered= */ false); + + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "\n\nspecial_function\narg1\n1\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ false, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* 
ignore_whitespace_differences= */ true + ); + + // Test template generation for tools with optional parameters + test_templates(tmpls.get(), end_tokens, message_assist_call_noopt, tools, + "\n\nspecial_function_with_opt\narg1\n1\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ false, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* ignore_whitespace_differences= */ true + ); + test_templates(tmpls.get(), end_tokens, message_assist_call_withopt, tools, + "\n\nspecial_function_with_opt\narg1\n1\narg2\n2\n\n", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ false, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* ignore_whitespace_differences= */ true + ); + } + + { + auto tmpls = read_templates("models/templates/Kimi-K2-Thinking.jinja"); + std::vector end_tokens{ "<|im_end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_KIMI_K2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_KIMI_K2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test parsing regular content + assert_msg_equals(message_assist, + test_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_KIMI_K2})); + + // Test parsing content with thinking + assert_msg_equals(message_assist_thoughts, + test_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + + // Test parsing tool calls + assert_msg_equals(message_assist_call, + test_chat_parse( + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_KIMI_K2})); + + // Test parsing tool calls with thinking + assert_msg_equals(message_assist_call_thoughts, + test_chat_parse( + "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test tool calls with extra content + assert_msg_equals(message_assist_call_content, + test_chat_parse( + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_KIMI_K2} + )); + + // Test tool calls with extra content AND thinking + assert_msg_equals(message_assist_call_thoughts_content, + test_chat_parse( + "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + })); + + // Test streaming + test_parser_with_streaming(message_assist_call_thoughts_content, + "I'm\nthinking\nHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* 
is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(message_assist_call_thoughts_unparsed, + "I'm\nthinking\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + test_parser_with_streaming(message_assist_call_thoughts_content, + "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(message_assist_call_withopt, + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE + }); }); + test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": \"123456\"}"), + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": [1, 2, \"345\", 6]}"), + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}"), + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("", "", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", 
\"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:0<|tool_call_argument_begin|>" + "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg("", "", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg( + "Let me start by examining the relevant files to understand the current implementation.", "", + "read_file", + "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}"), + "Let me start by examining the relevant files to understand the current implementation." + "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + auto multi_tool_msg = simple_assist_msg("Let me call multiple tools.", "I'm thinking."); + multi_tool_msg.tool_calls.push_back({ "read_file", "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}", "" }); + multi_tool_msg.tool_calls.push_back({ "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}", "" }); + multi_tool_msg.tool_calls.push_back({ "complex_function", "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}", "" }); + multi_tool_msg.tool_calls.push_back({ "emoji_function", "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" }); + test_parser_with_streaming(multi_tool_msg, + "I'm thinking.Let me call multiple tools." 
+ "<|tool_calls_section_begin|>" + "<|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>" + "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.complex_function:2<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.emoji_function:3<|tool_call_argument_begin|>" + "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}" + "<|tool_call_end|>" + "<|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("", "I'm thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("Hello", "I'm thinkingI'm still thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>I'm still thinkingHello", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + + // Test template rendering + common_chat_templates_inputs conversation_with_tools = inputs_tools; + conversation_with_tools.messages.push_back(simple_assist_msg("Let's do it", "Think first", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool response 1", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "complex_function", + /* .tool_call_id = */ "", + }); + conversation_with_tools.messages.push_back(simple_assist_msg("Continue", "Think next", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool response 2", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "web_search", + /* .tool_call_id = */ "", + }); + conversation_with_tools.messages.push_back(simple_assist_msg("CC", "Think last", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool 
response 3", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "read_file", + /* .tool_call_id = */ "", + }); + assert_equals(common_chat_templates_apply(tmpls.get(), conversation_with_tools).prompt, std::string("<|im_system|>tool_declare<|im_middle|>[{\"type\": \"function\", \"function\": {\"name\": \"special_function\", \"description\": \"I'm special\", \"parameters\": {\"type\": \"object\", \"properties\": {\"arg1\": {\"type\": \"integer\", \"description\": \"The arg.\"}}, \"required\": [\"arg1\"]}}}]<|im_end|><|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|><|im_user|>user<|im_middle|>Hey there!<|im_end|><|im_assistant|>assistant<|im_middle|>Think firstLet's do it<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>complex_function<|im_middle|>## Return of functions.complex_function:0\nTool response 1<|im_end|><|im_assistant|>assistant<|im_middle|>Think nextContinue<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>web_search<|im_middle|>## Return of functions.web_search:1\nTool response 2<|im_end|><|im_assistant|>assistant<|im_middle|>Think lastCC<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:2<|tool_call_argument_begin|>{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>read_file<|im_middle|>## Return of functions.read_file:2\nTool response 3<|im_end|><|im_assistant|>assistant<|im_middle|>")); + + // Test template generation for regular content + test_templates(tmpls.get(), end_tokens, message_assist, tools, + "Hello, world!\nWhat's up?", + /* expect_grammar_triggered= */ false); + + // Test template generation for tool calls + test_templates(tmpls.get(), end_tokens, message_assist_call, tools, + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* ignore_whitespace_differences= */ true + ); + + // Test template generation for tools with optional parameters + test_templates(tmpls.get(), end_tokens, message_assist_call_noopt, tools, + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* ignore_whitespace_differences= */ true + ); + test_templates(tmpls.get(), end_tokens, message_assist_call_withopt, tools, + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", + /* 
+ // Test Qwen3-Coder XML format
+ {
+ // Basic XML tool call parsing
+ assert_msg_equals(
+ message_assist_call,
+ test_chat_parse(
+ "<tool_call>\n"
+ " <function=special_function>\n"
+ " <parameter=arg1>\n"
+ " 1\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ /* is_partial= */ false,
+ {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}));
+
+ // Multiple parameters with different types
+ common_chat_msg expected_multi_param;
+ expected_multi_param.role = "assistant";
+ expected_multi_param.tool_calls = {
+ { "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}", "" }
+ };
+
+ test_parser_with_streaming(expected_multi_param,
+ "<tool_call>\n"
+ " <function=complex_function>\n"
+ " <parameter=name>\n"
+ " John Doe\n"
+ " </parameter>\n"
+ " <parameter=age>\n"
+ " 30\n"
+ " </parameter>\n"
+ " <parameter=active>\n"
+ " true\n"
+ " </parameter>\n"
+ " <parameter=score>\n"
+ " 95.5\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Special characters and Unicode
+ common_chat_msg expected_special_chars;
+ expected_special_chars.role = "assistant";
+ expected_special_chars.tool_calls = {
+ { "unicode_function", "{\"message\":\"Hello 世界! 🌍 Special chars: @#$%^&*()\"}", "" }
+ };
+
+ test_parser_with_streaming(expected_special_chars,
+ "<tool_call>\n"
+ " <function=unicode_function>\n"
+ " <parameter=message>\n"
+ " Hello 世界! 🌍 Special chars: @#$%^&*()\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Multiline content with newlines and indentation
+ common_chat_msg expected_multiline;
+ expected_multiline.role = "assistant";
+ expected_multiline.tool_calls = {
+ { "code_function", "{\"code\":\"def hello():\\n print(\\\"Hello, World!\\\")\\n return True\"}", "" }
+ };
+
+ test_parser_with_streaming(expected_multiline,
+ "<tool_call>\n"
+ " <function=code_function>\n"
+ " <parameter=code>\n"
+ "def hello():\n"
+ " print(\"Hello, World!\")\n"
+ " return True\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // JSON object as parameter value
+ common_chat_msg expected_json_param;
+ expected_json_param.role = "assistant";
+ expected_json_param.tool_calls = {
+ { "json_function", "{\"config\":{\"host\":\"localhost\",\"port\":8080,\"ssl\":false}}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_json_param,
+ "<tool_call>\n"
+ " <function=json_function>\n"
+ " <parameter=config>\n"
+ " {\"host\": \"localhost\", \"port\": 8080, \"ssl\": false}\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Array as parameter value
+ common_chat_msg expected_array_param;
+ expected_array_param.role = "assistant";
+ expected_array_param.tool_calls = {
+ { "array_function", "{\"items\":[\"apple\",\"banana\",\"cherry\"]}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_array_param,
+ "<tool_call>\n"
+ " <function=array_function>\n"
+ " <parameter=items>\n"
+ " [\"apple\", \"banana\", \"cherry\"]\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Empty parameter
+ common_chat_msg expected_empty_param;
+ expected_empty_param.role = "assistant";
+ expected_empty_param.tool_calls = {
+ { "empty_function", "{\"empty_param\":\"\"}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_empty_param,
+ "<tool_call>\n"
+ " <function=empty_function>\n"
+ " <parameter=empty_param>\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Boolean values (true/false)
+ common_chat_msg expected_boolean;
+ expected_boolean.role = "assistant";
+ expected_boolean.tool_calls = {
+ { "boolean_function", "{\"enabled\":true,\"debug\":false}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_boolean,
+ "<tool_call>\n"
+ " <function=boolean_function>\n"
+ " <parameter=enabled>\n"
+ " true\n"
+ " </parameter>\n"
+ " <parameter=debug>\n"
+ " false\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Null value
+ common_chat_msg expected_null;
+ expected_null.role = "assistant";
+ expected_null.tool_calls = {
+ { "null_function", "{\"optional_param\":null}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_null,
+ "<tool_call>\n"
+ " <function=null_function>\n"
+ " <parameter=optional_param>\n"
+ " null\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Negative numbers and scientific notation
+ common_chat_msg expected_numbers;
+ expected_numbers.role = "assistant";
+ expected_numbers.tool_calls = {
+ { "math_function", "{\"negative\":-42,\"decimal\":-3.14,\"scientific\":1.23e-4}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_numbers,
+ "<tool_call>\n"
+ " <function=math_function>\n"
+ " <parameter=negative>\n"
+ " -42\n"
+ " </parameter>\n"
+ " <parameter=decimal>\n"
+ " -3.14\n"
+ " </parameter>\n"
+ " <parameter=scientific>\n"
+ " 1.23e-4\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // XML-like content in parameters (should be escaped)
+ common_chat_msg expected_xml_content;
+ expected_xml_content.role = "assistant";
+ expected_xml_content.tool_calls = {
+ { "xml_function", "{\"xml_content\":\"value\"}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_xml_content,
+ "<tool_call>\n"
+ " <function=xml_function>\n"
+ " <parameter=xml_content>\n"
+ " value\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Quotes and escape characters
+ common_chat_msg expected_quotes;
+ expected_quotes.role = "assistant";
+ expected_quotes.tool_calls = {
+ { "quote_function", "{\"message\":\"She said \\\"Hello!\\\" and left.\"}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_quotes,
+ "<tool_call>\n"
+ " <function=quote_function>\n"
+ " <parameter=message>\n"
+ " She said \"Hello!\" and left.\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+
+ // Long parameter value (simplified)
+ std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data.";
+
+ common_chat_msg expected_long_text;
+ expected_long_text.role = "assistant";
+ expected_long_text.tool_calls = {
+ { "long_function", "{\"long_text\":\"" + long_text + "\"}", "" }
+ };
+
+ test_parser_with_streaming(
+ expected_long_text,
+ "<tool_call>\n"
+ " <function=long_function>\n"
+ " <parameter=long_text>\n"
+ " " + long_text + "\n"
+ " </parameter>\n"
+ " </function>\n"
+ "</tool_call>",
+ [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
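+ // As the cases above exercise, parameter values are sniffed by content: bare
+ // tokens such as true, null, -3.14 or 1.23e-4 and well-formed JSON objects or
+ // arrays come out as native JSON values, while anything else becomes a JSON string.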
\n" + " \n" + " \n" + " laptops\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Compact format (no extra whitespace) + common_chat_msg expected_compact; + expected_compact.role = "assistant"; + expected_compact.tool_calls = { + { "compact_function", "{\"param\":\"value\"}", "" } + }; + + test_parser_with_streaming( + expected_compact, + "value", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Function name with underscores and numbers + common_chat_msg expected_complex_name; + expected_complex_name.role = "assistant"; + expected_complex_name.tool_calls = { + { "get_user_data_v2", "{\"user_id\":12345}", "" } + }; + + test_parser_with_streaming( + expected_complex_name, + "\n" + " \n" + " \n" + " 12345\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Parameter names with underscores and numbers + common_chat_msg expected_complex_params; + expected_complex_params.role = "assistant"; + expected_complex_params.tool_calls = { + { "test_function", "{\"param_1\":\"value1\",\"param_2_name\":\"value2\",\"param3\":123}", "" } + }; + + test_parser_with_streaming( + expected_complex_params, + "\n" + " \n" + " \n" + " value1\n" + " \n" + " \n" + " value2\n" + " \n" + " \n" + " 123\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Very deeply nested XML content in parameter + common_chat_msg expected_deep_xml; + expected_deep_xml.role = "assistant"; + expected_deep_xml.tool_calls = { + { "xml_parser", "{\"xml\":\"deep content\"}", "" } + }; + + test_parser_with_streaming( + expected_deep_xml, + "\n" + " \n" + " \n" + " deep content\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Parameter with only whitespace + common_chat_msg expected_whitespace_param; + expected_whitespace_param.role = "assistant"; + expected_whitespace_param.tool_calls = { + { "whitespace_function", "{\"spaces\":\"\"}", "" } + }; + + test_parser_with_streaming( + expected_whitespace_param, + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Parameter with tabs and mixed whitespace + common_chat_msg expected_mixed_whitespace; + expected_mixed_whitespace.role = "assistant"; + expected_mixed_whitespace.tool_calls = { + { "tab_function", "{\"content\":\"line1\\n\\tindented line\\n spaces\"}", "" } + }; + + test_parser_with_streaming( + expected_mixed_whitespace, + "\n" + " \n" + " \n" + "line1\n" + "\tindented line\n" + " spaces\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Control characters and special Unicode + common_chat_msg expected_control_chars; + expected_control_chars.role = "assistant"; + expected_control_chars.tool_calls = { + { "control_function", "{\"text\":\"Line1\\nLine2\\tTabbed\\rCarriage return\"}", "" } + }; + + test_parser_with_streaming( + expected_control_chars, + "\n" + " \n" + " \n" + "Line1\nLine2\tTabbed\rCarriage return\n" + " \n" + " \n" + "", + 
[&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Emoji and extended Unicode characters + common_chat_msg expected_emoji; + expected_emoji.role = "assistant"; + expected_emoji.tool_calls = { + { "emoji_function", "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" } + }; + + test_parser_with_streaming( + expected_emoji, + "\n" + " \n" + " \n" + " Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Mathematical expressions and formulas + common_chat_msg expected_math; + expected_math.role = "assistant"; + expected_math.tool_calls = { + { "math_function", "{\"formula\":\"E = mc² and ∫f(x)dx = F(x) + C\"}", "" } + }; + + test_parser_with_streaming( + expected_math, + "\n" + " \n" + " \n" + " E = mc² and ∫f(x)dx = F(x) + C\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // SQL injection-like content (should be safely escaped) + common_chat_msg expected_sql; + expected_sql.role = "assistant"; + expected_sql.tool_calls = { + { "sql_function", "{\"query\":\"SELECT * FROM users WHERE id = 1; DROP TABLE users; --\"}", "" } + }; + + test_parser_with_streaming( + expected_sql, + "\n" + " \n" + " \n" + " SELECT * FROM users WHERE id = 1; DROP TABLE users; --\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // HTML/XML injection content + common_chat_msg expected_html; + expected_html.role = "assistant"; + expected_html.tool_calls = { + { "html_function", "{\"content\":\"\"}", "" } + }; + + test_parser_with_streaming( + expected_html, + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Binary-like content (base64) + common_chat_msg expected_binary; + expected_binary.role = "assistant"; + expected_binary.tool_calls = { + { "binary_function", "{\"data\":\"SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\"}", "" } + }; + + test_parser_with_streaming( + expected_binary, + "\n" + " \n" + " \n" + " SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + + // Very large numbers (should be parsed as scientific notation) + common_chat_msg expected_large_numbers; + expected_large_numbers.role = "assistant"; + expected_large_numbers.tool_calls = { + { "number_function", "{\"big_int\":1e+60}", "" } // Large number becomes scientific notation + }; + + test_parser_with_streaming( + expected_large_numbers, + "\n" + " \n" + " \n" + " 999999999999999999999999999999999999999999999999999999999999\n" + " \n" + " \n" + "", + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + } + + { + // Qwen3-Coder template + auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja"); + common_chat_templates_inputs inputs; + inputs.messages = { message_user }; + + common_chat_tool qwen_union_tool { + /* .name = */ "qwen_union", + /* .description = */ "Test tool for union/anyOf handling", + /* 
.parameters = */ R"({ + "type": "object", + "properties": { + "priority": { "type": ["number", "null"] }, + "maybe_text": { "anyOf": [ { "type": "string" } ] }, + "config": { "anyOf": [ { "type": "object" }, { "type": "null" } ] } + }, + "required": [] + })", + }; + inputs.tools = { qwen_union_tool }; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, params.format); + assert_equals(false, params.grammar.empty()); + + // Grammar should compile successfully + auto grammar = build_grammar(params.grammar); + GGML_ASSERT(grammar && "Failed to build Qwen3-Coder grammar with union types"); + } +} + +static void test_template_output_peg_parsers() { + printf("[%s]\n", __func__); + + // JSON schemas + const char * invoice_schema = R"({ + "type": "object", + "properties": { + "amount": {"type": "number"}, + "date": {"type": "string"} + } + })"; + + { + // Ministral-3-14B-Reasoning-2512 + auto tmpls = read_templates("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja"); + + // Test basic message + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "Hello, world!\nWhat's up?"; + t.expect = message_assist; + }); + + // Test basic message and reasoning with reasoning_format = none + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?"; + t.expect.content = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?"; + }); + + // Test basic message and reasoning with reasoning_format = auto + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + + t.expect = message_assist_thoughts; + }); + + // Test tool call + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call; + }); + + // Test tool call with reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "[THINK]I'm\nthinking[/THINK]" + R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.tools = {special_function_tool}; + + t.expect = message_assist_call_thoughts; + }); + + // Test parallel tool calls + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})" + R"([TOOL_CALLS]special_function_with_opt[ARGS]{"arg1": 1, "arg2": 2})"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ {}, + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ {}, + }}; + }); + + // Test response format + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "[THINK]I need to output the invoice details in JSON[/THINK]" + "```json\n" + R"({"amount": 123.45, "date": "2025-12-03"})" + "\n```"; + t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + t.params.json_schema = invoice_schema; + + t.expect.reasoning_content = "I need to output the invoice details in JSON"; + t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } + + { + // NVIDIA Nemotron-3 Nano + 
auto tmpls = read_templates("models/templates/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja");
+
+ // Test basic message
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input = "Hello, world!\nWhat's up?";
+ t.expect = message_assist;
+ });
+
+ // Test basic message and reasoning with reasoning_format = none
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input = "<think>I'm\nthinking</think>\n\nHello, world!\nWhat's up?";
+ t.expect.content = "<think>I'm\nthinking</think>\n\nHello, world!\nWhat's up?";
+ });
+
+ // Test basic message and reasoning with reasoning_format = auto
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input = "<think>I'm\nthinking</think>\n\nHello, world!\nWhat's up?";
+ t.params.enable_thinking = true;
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+
+ t.expect = message_assist_thoughts;
+ });
+
+ // Test tool call
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<tool_call>\n"
+ "<function=special_function>\n"
+ "<parameter=arg1>\n"
+ "1\n"
+ "</parameter>\n"
+ "</function>\n"
+ "</tool_call>";
+ t.params.enable_thinking = false;
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.tools = {special_function_tool};
+
+ t.expect = message_assist_call;
+ });
+
+ // Test tool call with reasoning
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<think>I'm\nthinking</think>\n\n"
+ "<tool_call>\n"
+ "<function=special_function>\n"
+ "<parameter=arg1>\n"
+ "1\n"
+ "</parameter>\n"
+ "</function>\n"
+ "</tool_call>";
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.tools = {special_function_tool};
+
+ t.expect = message_assist_call_thoughts;
+ });
+
+ // Test parallel tool calls
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<tool_call>\n"
+ "<function=special_function>\n"
+ "<parameter=arg1>\n"
+ "1\n"
+ "</parameter>\n"
+ "</function>\n"
+ "</tool_call>\n"
+ "<tool_call>\n"
+ "<function=special_function_with_opt>\n"
+ "<parameter=arg1>\n"
+ "1\n"
+ "</parameter>\n"
+ "<parameter=arg2>\n"
+ "2\n"
+ "</parameter>\n"
+ "</function>\n"
+ "</tool_call>";
+ t.params.enable_thinking = false;
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.parallel_tool_calls = true;
+ t.params.tools = {special_function_tool, special_function_tool_with_optional_param};
+
+ t.expect.tool_calls = {{
+ /* .name = */ "special_function",
+ /* .arguments = */ R"({"arg1": 1})",
+ /* .id = */ {},
+ }, {
+ /* .name = */ "special_function_with_opt",
+ /* .arguments = */ R"({"arg1": 1, "arg2": 2})",
+ /* .id = */ {},
+ }};
+ });
+
+ // Test tool call with string parameter
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<tool_call>\n"
+ "<function=python>\n"
+ "<parameter=code>\n"
+ "def hello():\n"
+ " print(\"Hello, world!\")\n"
+ "\n"
+ "hello()\n"
+ "</parameter>\n"
+ "</function>\n"
+ "</tool_call>";
+ t.params.enable_thinking = false;
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.tools = {python_tool};
+
+ t.expect.tool_calls = {{
+ /* .name = */ "python",
+ /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
+ /* .id = */ {},
+ }};
+ });
+
+ // Test tool call with string parameter and no closing tag
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<tool_call>\n"
+ "<function=python>\n"
+ "<parameter=code>\n"
+ "def hello():\n"
+ " print(\"Hello, world!\")\n"
+ "\n"
+ "hello()\n"
+ "</function>\n"
+ "</tool_call>";
+ t.params.enable_thinking = false;
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.tools = {python_tool};
+
+ t.expect.tool_calls = {{
+ /* .name = */ "python",
+ /* .arguments = */ "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}",
+ /* .id = */ {},
+ }};
+ });
+
+ // Test response format
+ test_peg_parser(tmpls.get(), [&](auto & t) {
+ t.input =
+ "<think>I need to output the invoice details in JSON</think>\n"
+ "\n"
+ R"({"amount": 123.45, "date": "2025-12-03"})";
+ t.params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ t.params.json_schema = invoice_schema;
+
+ t.expect.reasoning_content = "I need to output the invoice 
details in JSON"; + t.expect.content = R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } + + { + // Solar-Open-100B + auto tmpls = read_templates("models/templates/upstage-Solar-Open-100B.jinja"); + + // Test basic message + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|content|>Hello, world!\nWhat's up?"; + t.expect = message_assist; + }); + + // Test basic message and reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|think|>I'm\nthinking<|end|><|begin|>assistant<|content|>Hello, world!\nWhat's up?"; + t.expect = message_assist_thoughts; + }); + + // Test basic message and reasoning_effort = low + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|content|>Hello, world!\nWhat's up?"; + t.params.chat_template_kwargs["reasoning_effort"] = "\"low\""; + t.expect = message_assist; + }); + + // Test tool call + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|tool_calls|>" + "<|tool_call:begin|>123456789" + "<|tool_call:name|>special_function" + "<|tool_call:args|>{\"arg1\":1}" + "<|tool_call:end|>"; + + t.params.chat_template_kwargs["reasoning_effort"] = "\"low\""; + t.params.tools = {special_function_tool}; + t.expect = message_assist_call_id; + }); + + // Test tool call with reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|think|>I'm\nthinking<|end|>" + "<|begin|>assistant<|tool_calls|>" + "<|tool_call:begin|>0" + "<|tool_call:name|>special_function" + "<|tool_call:args|>{\"arg1\":1}" + "<|tool_call:end|>"; + + t.params.tools = {special_function_tool}; + t.expect = message_assist_thoughts_call_idx; + }); + + // Test tool call with reasoning and tool_choice = required + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|think|>I'm\nthinking<|end|>" + "<|begin|>assistant<|tool_calls|>" + "<|tool_call:begin|>0" + "<|tool_call:name|>special_function" + "<|tool_call:args|>{\"arg1\":1}" + "<|tool_call:end|>"; + + t.params.tools = {special_function_tool}; + t.params.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED; + t.expect = message_assist_thoughts_call_idx; + }); + + // Test tool call without reasoning and tool_choice = required + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|tool_calls|>" + "<|tool_call:begin|>0" + "<|tool_call:name|>special_function" + "<|tool_call:args|>{\"arg1\":1}" + "<|tool_call:end|>"; + + t.params.tools = {special_function_tool}; + t.params.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED; + t.params.chat_template_kwargs["reasoning_effort"] = "\"low\""; + t.expect = message_assist_call_idx; + }); + + // Test parallel tool calls + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|think|>I'm\nthinking<|end|>" + "<|begin|>assistant<|tool_calls|>" + "<|tool_call:begin|>0" + "<|tool_call:name|>special_function" + "<|tool_call:args|>{\"arg1\":1}" + "<|tool_call:end|>" + "<|tool_call:begin|>1" + "<|tool_call:name|>special_function_with_opt" + "<|tool_call:args|>{\"arg1\": 1, \"arg2\": 2}" + "<|tool_call:end|>"; + + t.params.parallel_tool_calls = true; + t.params.tools = {special_function_tool, special_function_tool_with_optional_param}; + + t.expect.reasoning_content = "I'm\nthinking"; + t.expect.tool_calls = {{ + /* .name = */ "special_function", + /* .arguments = */ R"({"arg1": 1})", + /* .id = */ "0", + }, { + /* .name = */ "special_function_with_opt", + /* .arguments = */ R"({"arg1": 1, "arg2": 2})", + /* .id = */ "1", + }}; + }); + + // Test response format + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|think|>I need to output the invoice 
details in JSON<|end|>" + "<|begin|>assistant<|content|>" + R"({"amount": 123.45, "date": "2025-12-03"})"; + + t.params.json_schema = invoice_schema; + + t.expect.reasoning_content = "I need to output the invoice details in JSON"; + t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + + // Test response format no reasoning + test_peg_parser(tmpls.get(), [&](auto & t) { + t.input = "<|content|>" + R"({"amount": 123.45, "date": "2025-12-03"})"; + + t.params.chat_template_kwargs["reasoning_effort"] = "\"low\""; + t.params.json_schema = invoice_schema; + + t.expect.content =R"({"amount": 123.45, "date": "2025-12-03"})"; + }); + } +} + +static void test_msg_diffs_compute() { + printf("[%s]\n", __func__); + { + common_chat_msg msg1; + + common_chat_msg msg2; + msg2.content = "Hello, world!"; + + common_chat_msg_diff diff; + diff.content_delta = "Hello, world!"; + + assert_equals( + {diff}, + common_chat_msg_diff::compute_diffs(msg1, msg2)); + } + { + common_chat_msg msg1; + msg1.content = "Hello,"; + + common_chat_msg msg2; + msg2.content = "Hello, world!"; + + common_chat_msg_diff diff; + diff.content_delta = " world!"; + + assert_equals( + {diff}, + common_chat_msg_diff::compute_diffs(msg1, msg2)); + } + { + common_chat_msg msg0; + + common_chat_msg msg1; + msg1.tool_calls = { { "special_function", "{\"ar", /* .id = */ "123" } }; + + common_chat_msg msg2; + msg2.tool_calls = { { "special_function", "{\"arg1\": 1}", /* .id = */ "123" } }; + + common_chat_msg_diff diff01; + diff01.tool_call_index = 0; + diff01.tool_call_delta.name = "special_function"; + diff01.tool_call_delta.id = "123"; + diff01.tool_call_delta.arguments = "{\"ar"; + + assert_equals( + {diff01}, + common_chat_msg_diff::compute_diffs(msg0, msg1)); + + common_chat_msg_diff diff12; + diff12.tool_call_index = 0; + // Note: neither id nor name change here. 
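+ // compute_diffs emits only the argument suffix, so a streaming client can append
+ // deltas verbatim: "{\"ar" followed by the delta "g1\": 1}" below reassembles the
+ // full "{\"arg1\": 1}" argument string.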
+ diff12.tool_call_delta.arguments = "g1\": 1}";
+
+ assert_equals(
+ {diff12},
+ common_chat_msg_diff::compute_diffs(msg1, msg2));
+ }
+ {
+ common_chat_msg msg0;
+
+ common_chat_msg msg2;
+ msg2.tool_calls = {
+ { "f1", "{\"arg1\": 1}", /* .id = */ "123" },
+ { "f2", "{\"arg2\": 2}", /* .id = */ "222" },
+ };
+
+ common_chat_msg_diff diff1;
+ diff1.tool_call_index = 0;
+ diff1.tool_call_delta.name = "f1";
+ diff1.tool_call_delta.id = "123";
+ diff1.tool_call_delta.arguments = "{\"arg1\": 1}";
+
+ common_chat_msg_diff diff2;
+ diff2.tool_call_index = 1;
+ diff2.tool_call_delta.name = "f2";
+ diff2.tool_call_delta.id = "222";
+ diff2.tool_call_delta.arguments = "{\"arg2\": 2}";
+
+ assert_equals(
+ {diff1, diff2},
+ common_chat_msg_diff::compute_diffs(msg0, msg2));
+ }
+}
+
+int main(int argc, char ** argv) {
+ common_log_set_verbosity_thold(999);
+
+ // try {
+#ifndef _WIN32
+ if (argc > 1) {
+ common_chat_templates_inputs inputs;
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = "Hey";
+ inputs.messages = {msg};
+ inputs.tools = { special_function_tool };
+
+ std::cout << "| Template | Format |\n";
+ std::cout << "|----------|--------|\n";
+
+ for (int i = 1; i < argc; i++) {
+ try {
+ std::string path = argv[i];
+ if (path.rfind(".jinja") != path.size() - 6) {
+ std::cerr << "Skipping non-jinja file: " << path << '\n';
+ continue;
+ }
+ auto tmpls = read_templates(path);
+ auto parts = string_split(path, "/");
+ auto name = parts[parts.size() - 1];
+ auto format = common_chat_format_name(common_chat_templates_apply(tmpls.get(), inputs).format);
+ std::cout << "| " << name << " | " << format << " |\n";
+ } catch (const std::exception & e) {
+ std::cerr << "Failed to process " << argv[i] << ": " << e.what() << '\n';
+ }
+ }
+ } else
+#endif
+ {
+ test_msg_diffs_compute();
+ test_msgs_oaicompat_json_conversion();
+ test_tools_oaicompat_json_conversion();
+ test_template_output_parsers();
+ test_template_output_peg_parsers();
+ std::cout << "\n[chat] All tests passed!" << '\n';
+ }
+ return 0;
+ // } catch (const std::exception & e) {
+ // std::cerr << "Error: " << e.what() << '\n';
+ // return 1;
+ // }
+}
diff --git a/llama.cpp/tests/test-double-float.cpp b/llama.cpp/tests/test-double-float.cpp
new file mode 100644
index 0000000..6aac473
--- /dev/null
+++ b/llama.cpp/tests/test-double-float.cpp
@@ -0,0 +1,57 @@
+// These tests may take a long time!
+// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
+// This is done by checking all finite (non-NaN, non-infinite) floats.
+
+#undef NDEBUG
+#include <cassert>
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
+#include <immintrin.h>
+#endif
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+// ggml.c::quantize_row_q4_0_ref
+inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_orig(float x) {
+ return x/(1.0 + exp(-x));
+}
+
+#pragma GCC diagnostic pop
+
+// ggml.c::quantize_row_q4_0_ref
+inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_float(float x) {
+ return x/(1.0f + expf(-x));
+}
+
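+// Driving a uint32_t through all 2^32 bit patterns and memcpy'ing each one into a
+// float visits every single-precision value exactly once (e.g. 0x3F800000 -> 1.0f);
+// std::isfinite() below filters out the NaN and infinity patterns.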
+int main(void) {
+ uint32_t x = UINT32_MAX;
+ do {
+ float f;
+ memcpy(&f, &x, sizeof(x));
+ assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
+ } while (x--);
+
+#ifdef __F16C__
+ // GELU and SILU implementations are used with a FP16 lookup table.
+ // The original and float-only results are not equal for all inputs after converting to FP16.
+ // GELU is an approximation anyway (tanh), not tested here.
+ // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
+ for (x = 0; x <= UINT16_MAX; x++) {
+ float f = _cvtsh_ss(x);
+ const float so = silu_orig(f);
+ const float sf = silu_float(f);
+ assert( (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
+ || (nextafterf(so, sf) == sf)
+ || (nextafterf(sf, so) == so));
+ }
+#endif
+}
diff --git a/llama.cpp/tests/test-gbnf-validator.cpp b/llama.cpp/tests/test-gbnf-validator.cpp
new file mode 100644
index 0000000..6547eec
--- /dev/null
+++ b/llama.cpp/tests/test-gbnf-validator.cpp
@@ -0,0 +1,109 @@
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+ const auto cpts = unicode_cpts_from_utf8(input_str);
+
+ auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+ size_t pos = 0;
+ for (const auto & cpt : cpts) {
+ llama_grammar_accept(grammar, cpt);
+
+ if (stacks_cur.empty()) {
+ error_pos = pos;
+ error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+ return false;
+ }
+ ++pos;
+ }
+
+ for (const auto & stack : stacks_cur) {
+ if (stack.empty()) {
+ return true;
+ }
+ }
+
+ error_pos = pos;
+ error_msg = "Unexpected end of input";
+ return false;
+}
+
+static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
+ fprintf(stdout, "Input string is invalid according to the grammar.\n");
+ fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
+ fprintf(stdout, "\n");
+ fprintf(stdout, "Input string:\n");
+ fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
+ if (error_pos < input_str.size()) {
+ fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
+ if (error_pos+1 < input_str.size()) {
+ fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
+ }
+ fprintf(stdout, "\033[0m\n");
+ }
+}
+
+int main(int argc, char** argv) {
+ if (argc != 3) {
+ fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
+ return 1;
+ }
+
+ const std::string grammar_filename = argv[1];
+ const std::string input_filename = argv[2];
+
+ // Read the GBNF grammar file
+ FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
+ if (!grammar_file) {
+ fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
+ return 1;
+ }
+
+ std::string grammar_str;
+ {
+ std::ifstream grammar_file(grammar_filename);
+ GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+ std::stringstream buffer;
+ buffer << grammar_file.rdbuf();
+ grammar_str = buffer.str();
+ }
+
+ llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+ if (grammar == nullptr) {
+ fprintf(stdout, "Failed to initialize llama_grammar\n");
+ return 1;
+ }
+ // Read the input file
+ std::string input_str;
+ {
+ std::ifstream input_file(input_filename);
+ GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+ std::stringstream buffer;
+ buffer << input_file.rdbuf();
+ input_str = buffer.str();
+ }
+
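+ // For an illustrative grammar like
+ //   root ::= "yes" | "no"
+ // the input "yes" validates (some stack empties), while "maybe" fails at
+ // position 0 with "Unexpected character 'm'".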
+ // Validate the input string against the grammar
+ size_t error_pos;
+ std::string error_msg;
+ bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
+
+ if (is_valid) {
+ fprintf(stdout, "Input string is valid according to the grammar.\n");
+ } else {
+ print_error_message(input_str, error_pos, error_msg);
+ }
+
+ // Clean up
+ llama_grammar_free_impl(grammar);
+
+ return 0;
+}
diff --git a/llama.cpp/tests/test-gguf.cpp b/llama.cpp/tests/test-gguf.cpp
new file mode 100644
index 0000000..84b7f3b
--- /dev/null
+++ b/llama.cpp/tests/test-gguf.cpp
@@ -0,0 +1,1355 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "../ggml/src/ggml-impl.h"
+#include "gguf.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <random>
+#include <string>
+#include <vector>
+
+constexpr int offset_has_kv = 1000;
+constexpr int offset_has_tensors = 2000;
+constexpr int offset_has_data = 3000;
+
+enum handcrafted_file_type {
+ HANDCRAFTED_HEADER_BAD_MAGIC = 10,
+ HANDCRAFTED_HEADER_BAD_VERSION_0 = 15,
+ HANDCRAFTED_HEADER_BAD_VERSION_1 = 20,
+ HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30,
+ HANDCRAFTED_HEADER_BAD_N_TENSORS = 40,
+ HANDCRAFTED_HEADER_BAD_N_KV = 50,
+ HANDCRAFTED_HEADER_EMPTY = 800,
+
+ HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv,
+ HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv,
+ // HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, // removed because it can result in allocations > 1 TB (default sanitizer limit)
+ HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv,
+ HANDCRAFTED_KV_BAD_ALIGN = 50 + offset_has_kv,
+ HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv,
+
+ HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_NBYTES_TOO_BIG = 45 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_BAD_ALIGN = 75 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN = 80 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors,
+ HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors,
+
+ HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data,
+ HANDCRAFTED_DATA_BAD_ALIGN = 15 + offset_has_data,
+ HANDCRAFTED_DATA_INCONSISTENT_ALIGN = 20 + offset_has_data,
+ HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data,
+ HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
+};
+
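+// The enum values double as an encoding: adding offset_has_kv / offset_has_tensors /
+// offset_has_data marks which file sections a handcrafted variant contains, and
+// values of 800+ within a band (e.g. HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv)
+// are the well-formed variants that expect_context_not_null() treats as parseable.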
+static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
+ switch (hft) {
+ case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC";
+ case HANDCRAFTED_HEADER_BAD_VERSION_0: return "HEADER_BAD_VERSION_0";
+ case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
+ case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE";
+ case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV";
+ case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS";
+ case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY";
+
+ case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE";
+ case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE";
+ case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY";
+ case HANDCRAFTED_KV_BAD_ALIGN: return "KV_BAD_ALIGN";
+ case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV";
+
+ case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE";
+ case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS";
+ case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE";
+ case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG";
+ case HANDCRAFTED_TENSORS_NBYTES_TOO_BIG: return "TENSORS_NBYTES_TOO_BIG";
+ case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE";
+ case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET";
+ case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME";
+ case HANDCRAFTED_TENSORS_BAD_ALIGN: return "TENSORS_BAD_ALIGN";
+ case HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN: return "TENSORS_INCONSISTENT_ALIGN";
+ case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS";
+ case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN";
+
+ case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA";
+ case HANDCRAFTED_DATA_BAD_ALIGN: return "DATA_BAD_ALIGN";
+ case HANDCRAFTED_DATA_INCONSISTENT_ALIGN: return "DATA_INCONSISTENT_ALIGN";
+ case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS";
+ case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN";
+ }
+ GGML_ABORT("fatal error");
+}
+
+static bool expect_context_not_null(const enum handcrafted_file_type hft) {
+ if (hft < offset_has_kv) {
+ return hft >= HANDCRAFTED_HEADER_EMPTY;
+ }
+ if (hft < offset_has_tensors) {
+ return hft >= HANDCRAFTED_KV_SUCCESS;
+ }
+ if (hft < offset_has_data) {
+ return hft >= HANDCRAFTED_TENSORS_SUCCESS;
+ }
+ return hft >= HANDCRAFTED_DATA_SUCCESS;
+}
+
+typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;
+
+static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
+ std::vector<tensor_config_t> tensor_configs;
+ tensor_configs.reserve(100);
+
+ for (int i = 0; i < 100; ++i) {
+ const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT);
+ if (ggml_type_size(type) == 0) {
+ continue;
+ }
+
+ std::array<int64_t, GGML_MAX_DIMS> shape = {1, 1, 1, 1};
+ shape[0] = (1 + rng() % 10) * ggml_blck_size(type);
+ const int n_dims = 1 + rng() % GGML_MAX_DIMS;
+ for (int i = 1; i < n_dims; ++i) {
+ shape[i] = 1 + rng() % 10;
+ }
+
+ tensor_configs.push_back(std::make_pair(type, shape));
+ }
+
+ return tensor_configs;
+}
+
+static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ kv_types.reserve(100);
+
+ for (int i = 0; i < 100; ++i) {
+ const gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT);
+
+ if (type == GGUF_TYPE_ARRAY) {
+ const gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT);
+ if (type_arr == GGUF_TYPE_ARRAY) {
+ continue;
+ }
+ kv_types.push_back(std::make_pair(type, type_arr));
+ continue;
+ }
+
+ kv_types.push_back(std::make_pair(type, gguf_type(-1)));
+ }
+ std::shuffle(kv_types.begin(), kv_types.end(), rng);
+
+ return kv_types;
+}
+
+template <typename T>
+static void helper_write(FILE * file, const T & val) {
+ GGML_ASSERT(fwrite(&val, 1, sizeof(val), file) == sizeof(val));
+}
+
+static void helper_write(FILE * file, const void * data, const size_t nbytes) {
+ GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
+}
+
+static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
+ FILE * file = tmpfile();
+
+ if (!file) {
+ return file;
+ }
+
+ std::mt19937 rng(seed);
+ uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
+
+ if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) {
+ const char bad_magic[4] = {'F', 'U', 'G', 'G'};
+ helper_write(file, bad_magic, sizeof(bad_magic));
+ } else {
+ helper_write(file, GGUF_MAGIC, 4);
+ }
+
+ if (hft == HANDCRAFTED_HEADER_BAD_VERSION_0) {
+ const uint32_t version = 0;
+ helper_write(file, version);
+ } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) {
+ const uint32_t version = 1;
+ helper_write(file, version);
+ } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) {
+ const uint32_t version = GGUF_VERSION + 1;
+ helper_write(file, version);
+ } else {
+ const uint32_t version = GGUF_VERSION;
+ helper_write(file, version);
+ }
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (hft >= offset_has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+
+ if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) {
+ const uint64_t n_tensors = -1;
+ helper_write(file, n_tensors);
+ } else {
+ const uint64_t n_tensors = tensor_configs.size();
+ helper_write(file, n_tensors);
+ }
+
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ if (hft >= offset_has_kv) {
+ kv_types = get_kv_types(rng);
+ }
+ {
+ uint64_t n_kv = kv_types.size();
+ if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
+ hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
+ hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+
+ n_kv += 1;
+ } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) {
+ n_kv = -1;
+ }
+ helper_write(file, n_kv);
+ }
+
+ if (hft < offset_has_kv) {
+ while (ftell(file) % alignment != 0) {
+ const char pad = 0;
+ helper_write(file, pad);
+ }
+
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(file, tmp);
+ }
+ rewind(file);
+ return file;
+ }
+
+ for (int i = 0; i < int(kv_types.size()); ++i) {
+ const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].first);
+ const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? GGUF_TYPE_COUNT : kv_types[i].second);
+
+ const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i));
+
+ if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) {
+ const uint64_t n = -1;
+ helper_write(file, n);
+ } else {
+ const uint64_t n = key.length();
+ helper_write(file, n);
+ }
+ helper_write(file, key.data(), key.length());
+
+ {
+ const int32_t type32 = int32_t(type);
+ helper_write(file, type32);
+ }
+
+ uint32_t data[16];
+ for (int j = 0; j < 16; ++j) {
+ data[j] = rng();
+ if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+ data[j] |= 0x01010101; // avoid random null-termination of string
+ }
+ }
+
+ if (type == GGUF_TYPE_STRING) {
+ const uint64_t n = rng() % sizeof(data);
+ helper_write(file, n);
+ helper_write(file, data, n);
+ continue;
+ }
+
+ if (type == GGUF_TYPE_ARRAY) {
+ {
+ const int32_t type32 = int32_t(type_arr);
+ helper_write(file, type32);
+ }
+ if (type_arr == GGUF_TYPE_STRING) {
+ const uint64_t nstr = rng() % (16 + 1);
+ helper_write(file, nstr);
+ for (uint64_t istr = 0; istr < nstr; ++istr) {
+ const uint64_t n = rng() % (sizeof(uint32_t) + 1);
+ helper_write(file, n);
+ helper_write(file, &data[istr], n);
+ }
+ continue;
+ }
+ const size_t type_size = gguf_type_size(type_arr);
+ const uint64_t n = (rng() % sizeof(data)) / type_size;
+ helper_write(file, n);
+ helper_write(file, &data, n*type_size);
+ continue;
+ }
+
+ helper_write(file, data, hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type));
+ }
+
+ if (hft == HANDCRAFTED_KV_BAD_ALIGN ||
+ hft == HANDCRAFTED_TENSORS_BAD_ALIGN || hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN ||
+ hft == HANDCRAFTED_DATA_BAD_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) {
+
+ const uint64_t n = strlen(GGUF_KEY_GENERAL_ALIGNMENT);
+ helper_write(file, n);
+ helper_write(file, GGUF_KEY_GENERAL_ALIGNMENT, n);
+
+ const int32_t type = gguf_type(GGUF_TYPE_UINT32);
+ helper_write(file, type);
+
+ alignment = expect_context_not_null(hft) ? 1 : 13;
+ helper_write(file, alignment);
+ }
+
+ if (hft < offset_has_tensors) {
+ while (ftell(file) % alignment != 0) {
+ const char pad = 0;
+ helper_write(file, pad);
+ }
+
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(file, tmp);
+ }
+ rewind(file);
+ return file;
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN || hft == HANDCRAFTED_DATA_INCONSISTENT_ALIGN) {
+ alignment = 1;
+ }
+
+ uint64_t offset = 0;
+ for (int i = 0; i < int(tensor_configs.size()); ++i) {
+ const ggml_type type = hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG ? GGML_TYPE_I64 : tensor_configs[i].first;
+ const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
+
+ std::string name = "my_tensor";
+ if (hft != HANDCRAFTED_TENSORS_DUPLICATE_NAME) {
+ name += "_" + std::to_string(i);
+ }
+ if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) {
+ name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+ GGML_ASSERT(name.length() >= GGML_MAX_NAME);
+ }
+ {
+ const uint64_t n = name.length();
+ helper_write(file, n);
+ }
+ helper_write(file, name.data(), name.length());
+
+ uint32_t n_dims = (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG || hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG) ? 2 : 1;
+ for (int i = GGML_MAX_DIMS-1; i >= 1; --i) {
+ if (shape[i] != 1) {
+ n_dims = i + 1;
+ break;
+ }
+ }
+ if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) {
+ const uint32_t n_dims_bad = GGML_MAX_DIMS + 1;
+ helper_write(file, n_dims_bad);
+ } else {
+ helper_write(file, n_dims);
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) {
+ const int64_t bad_dim = -1;
+ for (uint32_t j = 0; j < n_dims; ++j) {
+ helper_write(file, bad_dim);
+ }
+ } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){
+ const int64_t big_dim = 4*int64_t(INT32_MAX);
+ for (uint32_t j = 0; j < n_dims; ++j) {
+ helper_write(file, big_dim);
+ }
+ } else if (hft == HANDCRAFTED_TENSORS_NBYTES_TOO_BIG){
+ const size_t big_ne = SIZE_MAX/ggml_type_size(type);
+ const int64_t big_dim = GGML_PAD(int64_t(1.01f*std::pow(big_ne, 1.0f/n_dims)) + 1, ggml_blck_size(type));
+ for (uint32_t j = 0; j < n_dims; ++j) {
+ helper_write(file, big_dim);
+ }
+ } else {
+ helper_write(file, shape.data(), n_dims*sizeof(int64_t));
+ }
+
+ {
+ const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? GGML_TYPE_COUNT : int32_t(type);
+ helper_write(file, type32);
+ }
+
+ if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) {
+ const uint64_t bad_offset = -1;
+ helper_write(file, bad_offset);
+ } else {
+ helper_write(file, offset);
+ }
+
+ int64_t ne = shape[0];
+ for (uint32_t i = 1; i < n_dims; ++i) {
+ ne *= shape[i];
+ }
+ offset += GGML_PAD(ggml_row_size(type, ne), alignment);
+ }
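+ // Tensor data is laid out back to back, each offset padded up to the effective
+ // alignment; e.g. a 100-byte row at the default alignment of 32 puts the next
+ // tensor at offset GGML_PAD(100, 32) == 128.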
+
+ while (ftell(file) % alignment != 0) {
+ const char pad = 0;
+ helper_write(file, pad);
+ }
+
+ if (hft >= offset_has_data) {
+ rng.seed(seed + 1);
+ uint64_t nbytes = offset;
+ if (hft == HANDCRAFTED_DATA_NOT_ENOUGH_DATA) {
+ nbytes -= 1;
+ }
+ for (uint64_t i = 0; i < nbytes; ++i) {
+ const uint8_t random_byte = i % 256;
+ helper_write(file, random_byte);
+ }
+ }
+
+ for (int i = 0; i < extra_bytes; ++i) {
+ const char tmp = 0;
+ helper_write(file, tmp);
+ }
+ rewind(file);
+ return file;
+}
+
+static bool handcrafted_check_header(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_kv, const bool has_tensors, const bool alignment_defined) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
+ if (has_kv) {
+ kv_types = get_kv_types(rng);
+ }
+
+ bool ok = true;
+
+ if (gguf_get_version(gguf_ctx) != GGUF_VERSION) {
+ ok = false;
+ }
+ if (gguf_get_n_tensors(gguf_ctx) != int(tensor_configs.size())) {
+ ok = false;
+ }
+ if (gguf_get_n_kv(gguf_ctx) != int(alignment_defined ? kv_types.size() + 1 : kv_types.size())) {
+ ok = false;
+ }
+
+ return ok;
+}
+
+static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_tensors, const bool alignment_defined) {
+ if (!gguf_ctx) {
+ return false;
+ }
+
+ std::mt19937 rng(seed);
+
+ std::vector<tensor_config_t> tensor_configs;
+ if (has_tensors) {
+ tensor_configs = get_tensor_configs(rng);
+ }
+
+ std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types = get_kv_types(rng);
+
+ bool ok = true;
+
+ for (int i = 0; i < int(kv_types.size()); ++i) {
+ const enum gguf_type type = gguf_type(kv_types[i].first);
+ const enum gguf_type type_arr = gguf_type(kv_types[i].second);
+
+ const std::string key = "my_key_" + std::to_string(i);
+
+ uint32_t data[16];
+ for (int j = 0; j < 16; ++j) {
+ data[j] = rng();
+ if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) {
+ data[j] |= 0x01010101; // avoid random null-termination of string
+ }
+ }
+
+ const char * data8 = reinterpret_cast<const char *>(data);
+ const int id = gguf_find_key(gguf_ctx, key.c_str());
+
+ if (type == GGUF_TYPE_STRING) {
+ const char * str = gguf_get_val_str(gguf_ctx, id);
+ const uint64_t n = strlen(str);
+ const uint64_t n_expected = rng() % sizeof(data);
+ if (n != n_expected) {
+ ok = false;
+ continue;
+ }
+ if (!std::equal(str, str + n, data8)) {
+ ok = false;
+ }
+ continue;
+ }
+
+ if (type == GGUF_TYPE_ARRAY) {
+ const size_t type_size = gguf_type_size(type_arr);
+ const uint64_t arr_n = gguf_get_arr_n(gguf_ctx, id);
+
+ if (type_arr == GGUF_TYPE_STRING) {
+ const uint64_t nstr_expected = rng() % (16 + 1);
+ if (arr_n != nstr_expected) {
+ ok = false;
+ continue;
+ }
+ for (uint64_t istr = 0; istr < nstr_expected; ++istr) {
+ const char * str = gguf_get_arr_str(gguf_ctx, id, istr);
+ const uint64_t n = strlen(str);
+ const uint64_t n_expected = rng() % (sizeof(uint32_t) + 1);
+
+ if (n != n_expected) {
+ ok = false;
+ continue;
+ }
+ const char * str_expected = reinterpret_cast<const char *>(&data[istr]);
+ if (strncmp(str, str_expected, n) != 0) {
+ ok = false;
continue; + } + } + continue; + } + + const uint64_t arr_n_expected = (rng() % sizeof(data)) / type_size; + if (arr_n != arr_n_expected) { + ok = false; + continue; + } + + const char * data_gguf = reinterpret_cast(gguf_get_arr_data(gguf_ctx, id)); + + if (type_arr == GGUF_TYPE_BOOL) { + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { + if (bool(data8[arr_i]) != bool(data_gguf[arr_i])) { + ok = false; + } + } + continue; + } + + if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) { + ok = false; + } + continue; + } + + const char * data_gguf = reinterpret_cast(gguf_get_val_data(gguf_ctx, id)); + + if (type == GGUF_TYPE_BOOL) { + if (bool(*data8) != bool(*data_gguf)) { + ok = false; + } + continue; + } + + if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) { + ok = false; + } + } + + const uint32_t expected_alignment = alignment_defined ? 1 : GGUF_DEFAULT_ALIGNMENT; + if (gguf_get_alignment(gguf_ctx) != expected_alignment) { + ok = false; + } + + return ok; +} + +static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsigned int seed) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs = get_tensor_configs(rng); + + // Call get_kv_types to get the same RNG state: + get_kv_types(rng); + + bool ok = true; + + const int id_alignment = gguf_find_key(gguf_ctx, GGUF_KEY_GENERAL_ALIGNMENT); + const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT; + + uint64_t expected_offset = 0; + for (int i = 0; i < int(tensor_configs.size()); ++i) { + const ggml_type type = tensor_configs[i].first; + const std::array shape = tensor_configs[i].second; + + const std::string name = "my_tensor_" + std::to_string(i); + const int id = gguf_find_tensor(gguf_ctx, name.c_str()); + + if (id >= 0) { + if (std::string(gguf_get_tensor_name(gguf_ctx, id)) != name) { + ok = false; + } + + if (gguf_get_tensor_type(gguf_ctx, id) != type) { + ok = false; + } + } else { + ok = false; + continue; + } + + const size_t offset = gguf_get_tensor_offset(gguf_ctx, id); + + if (offset != expected_offset) { + ok = false; + } + + int64_t ne = shape[0]; + for (size_t j = 1; j < GGML_MAX_DIMS; ++j) { + ne *= shape[j]; + } + expected_offset += GGML_PAD(ggml_row_size(type, ne), alignment); + } + + return ok; +} + +static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const unsigned int seed, FILE * file) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs = get_tensor_configs(rng); + + bool ok = true; + + for (int i = 0; i < int(tensor_configs.size()); ++i) { + const ggml_type type = tensor_configs[i].first; + const std::array shape = tensor_configs[i].second; + + int64_t ne = shape[0]; + for (size_t j = 1; j < GGML_MAX_DIMS; ++j) { + ne *= shape[j]; + } + const size_t size = ggml_row_size(type, ne); + + const std::string name = "my_tensor_" + std::to_string(i); + const size_t offset = gguf_get_tensor_offset(gguf_ctx, gguf_find_tensor(gguf_ctx, name.c_str())); + + std::vector data(size); + GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0); + GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size()); + + for (size_t j = 0; j < size; ++j) { + const uint8_t expected_byte = (j + offset) % 256; + if (data[j] != expected_byte) { + ok = false; + } + } + } + + return ok; +} + +static std::pair test_handcrafted_file(const unsigned int seed) { + int npass = 0; + int ntest = 0; + + const std::vector 
hfts = { + HANDCRAFTED_HEADER_BAD_MAGIC, + HANDCRAFTED_HEADER_BAD_VERSION_0, + HANDCRAFTED_HEADER_BAD_VERSION_1, + HANDCRAFTED_HEADER_BAD_VERSION_FUTURE, + HANDCRAFTED_HEADER_BAD_N_KV, + HANDCRAFTED_HEADER_BAD_N_TENSORS, + HANDCRAFTED_HEADER_EMPTY, + + HANDCRAFTED_KV_BAD_KEY_SIZE, + HANDCRAFTED_KV_BAD_TYPE, + HANDCRAFTED_KV_DUPLICATE_KEY, + HANDCRAFTED_KV_BAD_ALIGN, + HANDCRAFTED_KV_SUCCESS, + + HANDCRAFTED_TENSORS_BAD_NAME_SIZE, + HANDCRAFTED_TENSORS_BAD_N_DIMS, + HANDCRAFTED_TENSORS_BAD_SHAPE, + HANDCRAFTED_TENSORS_NE_TOO_BIG, + HANDCRAFTED_TENSORS_NBYTES_TOO_BIG, + HANDCRAFTED_TENSORS_BAD_TYPE, + HANDCRAFTED_TENSORS_BAD_OFFSET, + HANDCRAFTED_TENSORS_DUPLICATE_NAME, + HANDCRAFTED_TENSORS_BAD_ALIGN, + HANDCRAFTED_TENSORS_INCONSISTENT_ALIGN, + HANDCRAFTED_TENSORS_SUCCESS, + HANDCRAFTED_TENSORS_CUSTOM_ALIGN, + + HANDCRAFTED_DATA_NOT_ENOUGH_DATA, + HANDCRAFTED_DATA_BAD_ALIGN, + HANDCRAFTED_DATA_INCONSISTENT_ALIGN, + HANDCRAFTED_DATA_SUCCESS, + HANDCRAFTED_DATA_CUSTOM_ALIGN, + }; + + for (enum handcrafted_file_type hft : hfts) { + printf("%s: handcrafted_file_type=%s\n", __func__, handcrafted_file_type_name(hft).c_str()); + FILE * file = get_handcrafted_file(seed, hft); + +#ifdef _WIN32 + if (!file) { + printf("failed to create tmpfile(), needs elevated privileges on Windows"); + printf("skipping tests"); + continue; + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + + struct ggml_context * ctx = nullptr; + struct gguf_init_params gguf_params = { + /*no_alloc =*/ false, + /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, + }; + + struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); + + if (expect_context_not_null(hft)) { + printf("%s: - context_not_null: ", __func__); + } else { + printf("%s: - context_null: ", __func__); + } + if (bool(gguf_ctx) == expect_context_not_null(hft)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + if (hft >= offset_has_data && !expect_context_not_null(hft)) { + printf("%s: - no_dangling_ggml_context_pointer: ", __func__); + if (ctx) { + printf("\033[1;31mFAIL\033[0m\n"); + } else { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } + ntest++; + } + + const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN; + + if (expect_context_not_null(hft)) { + printf("%s: - check_header: ", __func__); + if (handcrafted_check_header(gguf_ctx, seed, hft >= offset_has_kv, hft >= offset_has_tensors, alignment_defined)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_kv) { + printf("%s: - check_kv: ", __func__); + if (handcrafted_check_kv(gguf_ctx, seed, hft >= offset_has_tensors, alignment_defined)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_tensors) { + printf("%s: - check_tensors: ", __func__); + if (handcrafted_check_tensors(gguf_ctx, seed)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_data) { + printf("%s: - check_tensor_data: ", __func__); + if (handcrafted_check_tensor_data(gguf_ctx, seed, file)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + fclose(file); + if (gguf_ctx) { + 
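+        // Note: the ggml context is only populated when gguf_init_from_file_impl succeeded
+        // (and a context was requested via gguf_params), so it is freed together with the
+        // gguf context here; ggml_free(nullptr) is a no-op for the cases where it was not set.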
ggml_free(ctx); + gguf_free(gguf_ctx); + } + printf("\n"); + } + + + return std::make_pair(npass, ntest); +} + +struct random_gguf_context_result { + struct gguf_context * gguf_ctx; + struct ggml_context * ctx; + ggml_backend_buffer_t buffer; +}; + +static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t backend, const unsigned int seed) { + std::mt19937 rng(seed); + + struct gguf_context * gguf_ctx = gguf_init_empty(); + + for (int i = 0; i < 256; ++i) { + const std::string key = "my_key_" + std::to_string(rng() % 1024); + const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT); + + switch (type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (gguf_ctx, key.c_str(), rng() % (1 << 15)); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (gguf_ctx, key.c_str(), rng() % (1 << 15) - (1 << 14)); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (gguf_ctx, key.c_str(), rng()); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(gguf_ctx, key.c_str(), rng() % 2 == 0); break; + case GGUF_TYPE_STRING: gguf_set_val_str (gguf_ctx, key.c_str(), std::to_string(rng()).c_str()); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (gguf_ctx, key.c_str(), rng()); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break; + case GGUF_TYPE_ARRAY: { + const enum gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT); + const uint64_t ne = rng() % 1024; + + switch (type_arr) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_BOOL: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: { + const size_t nbytes = ne*gguf_type_size(type_arr); + std::vector random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t)); + for (size_t j = 0; j < random_data.size(); ++j) { + random_data[j] = rng(); + if (type_arr == GGUF_TYPE_BOOL) { + random_data[j] &= 0x01010101; // the sanitizer complains if booleans are not 0 or 1 + } + } + gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne); + } break; + case GGUF_TYPE_STRING: { + std::vector data_cpp(ne); + std::vector data_c(ne); + for (size_t j = 0; j < data_cpp.size(); ++j) { + data_cpp[j] = std::to_string(rng()); + data_c[j] = data_cpp[j].c_str(); + } + gguf_set_arr_str(gguf_ctx, key.c_str(), data_c.data(), ne); + } break; + case GGUF_TYPE_ARRAY: { + break; // not supported + } + case GGUF_TYPE_COUNT: + default: { + GGML_ABORT("fatal error"); + } + } + } break; + case GGUF_TYPE_COUNT: + default: { + GGML_ABORT("fatal error"); + } + } + } + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ 256*ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + for (int i = 0; i < 256; ++i) { + const std::string name = "my_tensor_" + std::to_string(i); + const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT); + const size_t type_size = ggml_type_size(type); + + if (type_size == 0) { + 
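+            // Descriptive note: a ggml_type_size() of 0 presumably marks enum slots that no
+            // longer correspond to a usable type (e.g. removed quantization formats), so such
+            // draws from rng() are skipped instead of being used to create a tensor.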
continue; + } + + const int n_dims = 1 + rng() % GGML_MAX_DIMS; + int64_t ne[GGML_MAX_DIMS]; + ne[0] = (1 + rng() % 10) * ggml_blck_size(type); + for (int j = 1; j < n_dims; ++j) { + ne[j] = 1 + rng() % 10; + } + + struct ggml_tensor * tensor = ggml_new_tensor(ctx, type, n_dims, ne); + ggml_set_name(tensor, name.c_str()); + } + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + const size_t nbytes = ggml_nbytes(t); + std::vector random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t)); + for (size_t j = 0; j < random_data.size(); ++j) { + random_data[j] = rng(); + } + ggml_backend_tensor_set(t, random_data.data(), 0, nbytes); + + gguf_add_tensor(gguf_ctx, t); + } + + return {gguf_ctx, ctx, buf}; +} + +static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other) { + bool ok = true; + + const int n_kv = gguf_get_n_kv(ctx); + for (int id = 0; id < n_kv; ++id) { + const char * name = gguf_get_key(ctx, id); + + const int idx_other = gguf_find_key(other, name); + if (idx_other < 0) { + ok = false; + continue; + } + + const gguf_type type = gguf_get_kv_type(ctx, id); + if (type != gguf_get_kv_type(other, idx_other)) { + ok = false; + continue; + } + + if (type == GGUF_TYPE_ARRAY) { + const size_t arr_n = gguf_get_arr_n(ctx, id); + if (arr_n != gguf_get_arr_n(other, idx_other)) { + ok = false; + continue; + } + + const gguf_type type_arr = gguf_get_arr_type(ctx, id); + if (type_arr != gguf_get_arr_type(other, idx_other)) { + ok = false; + continue; + } + + if (type_arr == GGUF_TYPE_BOOL) { + const int8_t * data = reinterpret_cast(gguf_get_arr_data(ctx, id)); + const int8_t * data_other = reinterpret_cast(gguf_get_arr_data(other, idx_other)); + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { + if (bool(data[arr_i]) != bool(data_other[arr_i])) { + ok = false; + } + } + continue; + } + + if (type_arr == GGUF_TYPE_STRING) { + for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) { + const std::string str = gguf_get_arr_str(ctx, id, arr_i); + const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i); + if (str != str_other) { + ok = false; + } + } + continue; + } + + const int8_t * data = reinterpret_cast(gguf_get_arr_data(ctx, id)); + const int8_t * data_other = reinterpret_cast(gguf_get_arr_data(other, idx_other)); + if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) { + ok = false; + } + continue; + } + + if (type == GGUF_TYPE_STRING) { + const std::string str = gguf_get_val_str(ctx, id); + const std::string str_other = gguf_get_val_str(other, idx_other); + if (str != str_other) { + ok = false; + } + continue; + } + + const char * data = reinterpret_cast(gguf_get_val_data(ctx, id)); + const char * data_other = reinterpret_cast(gguf_get_val_data(other, idx_other)); + if (!std::equal(data, data + gguf_type_size(type), data_other)) { + ok = false; + } + } + + return ok; +} + +static bool all_tensors_in_other(const gguf_context * ctx, const gguf_context * other) { + bool ok = true; + + const int n_tensors = gguf_get_n_tensors(ctx); + for (int id = 0; id < n_tensors; ++id) { + const std::string name = gguf_get_tensor_name(ctx, id); + + const int idx_other = gguf_find_tensor(other, name.c_str()); + if (id != idx_other) { + ok = false; + if (idx_other < 0) { + continue; + } + } + + const ggml_type type = gguf_get_tensor_type(ctx, id); + if (type != gguf_get_tensor_type(other, id)) { + ok = false; + } + + const 
size_t offset = gguf_get_tensor_offset(ctx, id); + if (offset != gguf_get_tensor_offset(other, id)) { + ok = false; + } + } + + return ok; +} + +static bool same_tensor_data(const struct ggml_context * orig, const struct ggml_context * read) { + bool ok = true; + + struct ggml_tensor * t_orig = ggml_get_first_tensor(orig); + struct ggml_tensor * t_read = ggml_get_first_tensor(read); + + if (std::string(t_read->name) != "GGUF tensor data binary blob") { + return false; + } + t_read = ggml_get_next_tensor(read, t_read); + + while (t_orig) { + if (!t_read) { + ok = false; + break; + } + + const size_t nbytes = ggml_nbytes(t_orig); + if (ggml_nbytes(t_read) != nbytes) { + ok = false; + break; + } + std::vector data_orig(nbytes); + ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes); + if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast(t_read->data))) { + ok = false; + } + + t_orig = ggml_get_next_tensor(orig, t_orig); + t_read = ggml_get_next_tensor(read, t_read); + } + if (t_read) { + ok = false; + } + + return ok; +} + +static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + printf("%s: device=%s, backend=%s, only_meta=%s\n", + __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no"); + + int npass = 0; + int ntest = 0; + + struct gguf_context * gguf_ctx_0; + struct ggml_context * ctx_0; + ggml_backend_buffer_t bbuf; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed); + gguf_ctx_0 = result.gguf_ctx; + ctx_0 = result.ctx; + bbuf = result.buffer; + } + + FILE * file = tmpfile(); + +#ifdef _WIN32 + if (!file) { + printf("failed to create tmpfile(), needs elevated privileges on Windows"); + printf("skipping tests"); + return std::make_pair(0, 0); + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + + { + std::vector buf; + gguf_write_to_buf(gguf_ctx_0, buf, only_meta); + GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size()); + rewind(file); + } + + struct ggml_context * ctx_1 = nullptr; + struct gguf_init_params gguf_params = { + /*no_alloc =*/ false, + /*ctx =*/ only_meta ? 
nullptr : &ctx_1, + }; + struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params); + + printf("%s: same_version: ", __func__); + if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: same_n_kv: ", __func__); + if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: same_n_tensors: ", __func__); + if (gguf_get_n_tensors(gguf_ctx_0) == gguf_get_n_tensors(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_orig_kv_in_read: ", __func__); + if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_read_kv_in_orig: ", __func__); + if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_orig_tensors_in_read: ", __func__); + if (all_tensors_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_read_tensors_in_orig: ", __func__); + if (all_tensors_in_other(gguf_ctx_1, gguf_ctx_0)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + if (!only_meta) { + printf("%s: same_tensor_data: ", __func__); + if (same_tensor_data(ctx_0, ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + ggml_backend_buffer_free(bbuf); + ggml_free(ctx_0); + ggml_free(ctx_1); + gguf_free(gguf_ctx_0); + gguf_free(gguf_ctx_1); + ggml_backend_free(backend); + fclose(file); + + printf("\n"); + return std::make_pair(npass, ntest); +} + +static std::pair test_gguf_set_kv(ggml_backend_dev_t dev, const unsigned int seed) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + printf("%s: device=%s, backend=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend)); + + int npass = 0; + int ntest = 0; + + struct gguf_context * gguf_ctx_0; + struct ggml_context * ctx_0; + ggml_backend_buffer_t bbuf_0; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed); + gguf_ctx_0 = result.gguf_ctx; + ctx_0 = result.ctx; + bbuf_0 = result.buffer; + } + + struct gguf_context * gguf_ctx_1; + struct ggml_context * ctx_1; + ggml_backend_buffer_t bbuf_1; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed + 1); + gguf_ctx_1 = result.gguf_ctx; + ctx_1 = result.ctx; + bbuf_1 = result.buffer; + } + + struct gguf_context * gguf_ctx_2 = gguf_init_empty(); + + gguf_set_kv(gguf_ctx_1, gguf_ctx_0); + gguf_set_kv(gguf_ctx_2, gguf_ctx_0); + + printf("%s: same_n_kv: ", __func__); + if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_2)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_kv_0_in_1: ", __func__); + if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_kv_0_in_2: ", __func__); + if 
(all_kv_in_other(gguf_ctx_0, gguf_ctx_2)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    gguf_set_kv(gguf_ctx_0, gguf_ctx_1);
+
+    printf("%s: same_n_kv_after_double_copy: ", __func__);
+    if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    printf("%s: all_kv_1_in_0_after_double_copy: ", __func__);
+    if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) {
+        printf("\033[1;32mOK\033[0m\n");
+        npass++;
+    } else {
+        printf("\033[1;31mFAIL\033[0m\n");
+    }
+    ntest++;
+
+    ggml_backend_buffer_free(bbuf_0);
+    ggml_backend_buffer_free(bbuf_1);
+    ggml_free(ctx_0);
+    ggml_free(ctx_1);
+    gguf_free(gguf_ctx_0);
+    gguf_free(gguf_ctx_1);
+    gguf_free(gguf_ctx_2);
+    ggml_backend_free(backend);
+
+    printf("\n");
+    return std::make_pair(npass, ntest);
+}
+
+static void print_usage() {
+    printf("usage: test-gguf [seed]\n");
+    printf(" if no seed is specified then a random seed is used\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc > 2) {
+        print_usage();
+        return 1;
+    }
+
+    std::random_device rd;
+    const unsigned int seed = argc < 2 ? rd() : std::stoi(argv[1]);
+
+    // Initialize ggml backends early so the prints aren't interleaved with the test results:
+    ggml_backend_dev_count();
+    fprintf(stderr, "\n");
+
+    int npass = 0;
+    int ntest = 0;
+    {
+        std::pair<int, int> result = test_handcrafted_file(seed);
+        npass += result.first;
+        ntest += result.second;
+    }
+
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+        for (bool only_meta : {true, false}) {
+            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+            npass += result.first;
+            ntest += result.second;
+        }
+
+        {
+            std::pair<int, int> result = test_gguf_set_kv(dev, seed);
+            npass += result.first;
+            ntest += result.second;
+        }
+    }
+
+    printf("%d/%d tests passed\n", npass, ntest);
+    if (npass != ntest) {
+        printf("\033[1;31mFAIL\033[0m\n");
+        return 1;
+    }
+    printf("\033[1;32mOK\033[0m\n");
+    return 0;
+}
diff --git a/llama.cpp/tests/test-grammar-integration.cpp b/llama.cpp/tests/test-grammar-integration.cpp
new file mode 100644
index 0000000..7aa7e58
--- /dev/null
+++ b/llama.cpp/tests/test-grammar-integration.cpp
@@ -0,0 +1,1439 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "json-schema-to-grammar.h"
+
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
+
+#include <nlohmann/json.hpp>
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+static llama_grammar * build_grammar(const std::string & grammar_str) {
+    return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+}
+
+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    bool grammar_fails = false;
+    llama_grammar * grammar = build_grammar(grammar_str);
+    if (grammar != nullptr) {
+        fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
+    } else {
+        grammar_fails = true;
+        fprintf(stdout, " ✅︎\n");
+    }
+    return grammar_fails;
+}
+
+struct token_and_piece {
+    llama_token token;
+    std::string piece;
+};
+
+// token() encodes a 32-bit ID as 5 bytes: a 0xff marker followed by the ID in big-endian order.
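+// For example (illustrative): token(10) produces the bytes { 0xff, 0x00, 0x00, 0x00, 0x0a },
+// which parse_tokens() below decodes back into llama_token 10 with the piece "<[10]>".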
+static std::string token(llama_token id) {
+    return std::string{
+        static_cast<char>(0xff),
+        static_cast<char>((id >> 24) & 0xff),
+        static_cast<char>((id >> 16) & 0xff),
+        static_cast<char>((id >> 8) & 0xff),
+        static_cast<char>(id & 0xff)
+    };
+}
+
+// parse_tokens() parses the token encoding above and UTF-8 text.
+static std::vector<token_and_piece> parse_tokens(const std::string & input) {
+    std::vector<token_and_piece> result;
+    result.reserve(input.size());
+    size_t offset = 0;
+    while (offset < input.size()) {
+        try {
+            if (static_cast<uint8_t>(input[offset]) == 0xff) {
+                if (offset + 5 > input.size()) {
+                    throw std::runtime_error("not enough bytes for token id");
+                }
+                uint32_t val =
+                    (static_cast<uint8_t>(input[offset + 1]) << 24) |
+                    (static_cast<uint8_t>(input[offset + 2]) << 16) |
+                    (static_cast<uint8_t>(input[offset + 3]) << 8) |
+                    (static_cast<uint8_t>(input[offset + 4]));
+                auto piece = "<[" + std::to_string(val) + "]>";
+                result.push_back({static_cast<llama_token>(val), piece});
+                offset += 5;
+            } else {
+                uint32_t cpt = unicode_cpt_from_utf8(input, offset);
+                result.push_back({0, unicode_cpt_to_utf8(cpt)});
+            }
+        } catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.push_back({0, unicode_cpt_to_utf8(0xFFFD)}); // replacement character
+        }
+    }
+    return result;
+}
+
+static bool match_string(const std::string & input, llama_grammar * grammar) {
+    const auto parsed = parse_tokens(input);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    for (const auto & in : parsed) {
+        try {
+            llama_grammar_accept_token(*grammar, in.token, in.piece);
+        } catch (const std::runtime_error & /*e*/) {
+            // normally this shouldn't get hit because of llama_grammar_apply
+            return false;
+        }
+
+        if (stacks_cur.empty()) {
+            // no stacks means that the grammar failed to match at this point
+            return false;
+        }
+    }
+
+    for (const auto & stack : stacks_cur) {
+        if (stack.empty()) {
+            // An empty stack means that the grammar has been completed
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
+    fflush(stderr);
+
+    auto * grammar = build_grammar(grammar_str);
+
+    // Save the original grammar stacks so that we can reset after every new string we want to test
+    const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
+
+    llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    fprintf(stderr, " 🔵 Valid strings:\n");
+
+    // Passing strings
+    for (const auto & test_string : passing_strings) {
+        fprintf(stderr, " \"%s\" ", test_string.c_str());
+        fflush(stderr);
+
+        bool matched = match_string(test_string, grammar);
+
+        if (!matched) {
+            fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with the gbnf-validator program to see exactly where things failed.
+ // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf + FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w"); + if (grammar_file) { + fprintf(grammar_file, "%s", grammar_str.c_str()); + fclose(grammar_file); + } + + // DEBUG: Write the test string to test-grammar-integration.string.txt + FILE* string_file = fopen("test-grammar-integration.string.txt", "w"); + if (string_file) { + fprintf(string_file, "%s", test_string.c_str()); + fclose(string_file); + } + + fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + + assert(matched); + + // Reset the grammar stacks + stacks_cur = stacks_org; + } + + fprintf(stderr, " 🟠 Invalid strings:\n"); + + // Failing strings + for (const auto & test_string : failing_strings) { + fprintf(stderr, " \"%s\" ", test_string.c_str()); + fflush(stderr); + + bool matched = match_string(test_string, grammar); + + if (matched) { + fprintf(stderr, "❌ (incorrectly matched)\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + assert(!matched); + + // Reset the grammar stacks + stacks_cur = stacks_org; + } + + // Clean up allocated memory + llama_grammar_free_impl(grammar); +} +static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { + test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings); +} +static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector & passing_strings, const std::vector & failing_strings) { + test(test_desc + ". 
Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str), true), passing_strings, failing_strings); +} + +static void test_simple_grammar() { + test_schema( + "min 0", + R"""({ + "type": "integer", + "minimum": 0 + })""", + // Passing strings + { + "0", + "10", + "12", + "10000", + }, + // Failing strings + { + "-1", + "-10", + "-10000", + "-100000000000000000000000000000000", + "100000000000000000000000000000000", + "00", + "01", + "-0", + } + ); + test_schema( + "min 2", + // Schema + R"""({ + "type": "integer", + "minimum": 2 + })""", + // Passing strings + { + "2", + "3", + "4", + "10", + "20", + "1234567890000000", + }, + // Failing strings + { + "0", + "1", + "-1", + "-100", + "0", + "1", + "01", + "02", + "12345678900000000", + } + ); + test_schema( + "min 456", + R"""({ + "type": "integer", + "minimum": 456 + })""", + // Passing strings + { + "456", + "4560", + "457", + "460", + "500", + }, + // Failing strings + { + "455", + "356", + "50", + "050", + "-1", + "-456", + } + ); + test_schema( + "min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "123", + "1234", + "2345", + }, + // Failing strings + { + "-1234", + "-124", + } + ); + + test_schema( + "max 9999", + // Schema + R"""({ + "type": "integer", + "maximum": 9999 + })""", + // Passing strings + { + "-99999", + "0", + "9999", + }, + // Failing strings + { + "10000", + "99991", + } + ); + test_schema( + "max -9999", + // Schema + R"""({ + "type": "integer", + "maximum": -9999 + })""", + // Passing strings + { + "-10000", + "-9999", + }, + // Failing strings + { + "-9998", + "0", + "9999", + } + ); + test_schema( + "min 5 max 30", + // Schema + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + // Passing strings + { + "5", + "10", + "30", + }, + // Failing strings + { + "05", + "4", + "-1", + "31", + "123", + "0123", + } + ); + test_schema( + "min 1 max 900719925474091", + // Schema + R"""({ + "type": "integer", + "exclusiveMinimum": 0, + "maximum": 900719925474091 + })""", + // Passing strings + { + "1", + "2", + "10", + "900719925474090", + "900719925474091", + }, + // Failing strings + { + "0", + "01", + "900719925474092", + "9007199254740910", + } + ); + test_schema( + "min -1 max 1", + R"""({ + "type": "integer", + "minimum": -1, + "maximum": 1 + })""", + // Passing strings + { + "-1", + "0", + "1", + }, + // Failing strings + { + "-11", + "-10", + "-2", + "2", + "10", + "11", + } + ); + test_schema( + "min -123 max 42", + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + // Passing strings + { + "-123", + "-122", + "-13", + "-11", + "-2", + "-1", + "0", + "1", + "5", + "10", + "39", + "40", + "42", + }, + // Failing strings + { + "-0123", + "-124", + "-1123", + "-200", + "43", + "123", + "0123", + } + ); + test_schema( + "exclusive min / max", + // Schema + R"""({ + "type": "integer", + "exclusiveMinimum": 0, + "exclusiveMaximum": 10000 + })""", + // Passing strings + { + "1", + "9999", + }, + // Failing strings + { + "0", + "01", + "10000", + "99999", + } + ); + + // Test case for a simple grammar + test_grammar( + "simple grammar", + R"""( + root ::= expr + expr ::= term ("+" term)* + term ::= number + number ::= [0-9]+)""", + // Passing strings + { + "42", + "1+2+3+4+5", + "123+456", + }, + // Failing strings + { + "+", + "/ 3", + "1+2+3+4+5+", + "12a45", + } + ); + + // Test case for a simple grammar with tokens + test_grammar( + "simple grammar with tokens", + R"""( + 
root ::= <[10]> content <[11]> + content ::= (!<[11]>)*)""", + // Passing strings + { + token(10) + "hello world" + token(11), + token(10) + "text with " + token(12) + " other tokens " + token(13) + " mixed in" + token(11), + token(10) + token(11), + token(10) + token(12) + token(13) + token(14) + token(15) + token(11), + token(10) + "a" + token(11), + }, + // Failing strings + { + token(10) + "missing end token", + token(10), + "missing start token" + token(11), + token(10) + token(11) + token(11), // double end token + token(11) + "wrong order" + token(10), + } + ); +} + +static void test_complex_grammar() { + // Test case for a more complex grammar, with both failure strings and success strings + test_grammar( + "medium complexity grammar", + // Grammar + R"""( + root ::= expression + expression ::= term ws (("+"|"-") ws term)* + term ::= factor ws (("*"|"/") ws factor)* + factor ::= number | variable | "(" expression ")" | function-call + number ::= [0-9]+ + variable ::= [a-zA-Z_][a-zA-Z0-9_]* + function-call ::= variable ws "(" (expression ("," ws expression)*)? ")" + ws ::= [ \t\n\r]?)""", + // Passing strings + { + "42", + "1*2*3*4*5", + "x", + "x+10", + "x1+y2", + "(a+b)*(c-d)", + "func()", + "func(x,y+2)", + "a*(b+c)-d/e", + "f(g(x),h(y,z))", + "x + 10", + "x1 + y2", + "(a + b) * (c - d)", + "func()", + "func(x, y + 2)", + "a * (b + c) - d / e", + "f(g(x), h(y, z))", + "123+456", + "123*456*789-123/456+789*123", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" + }, + // Failing strings + { + "+", + "/ 3x", + "x + + y", + "a * / b", + "func(,)", + "func(x y)", + "(a + b", + "x + y)", + "a + b * (c - d", + "42 +", + "x +", + "x + 10 +", + "(a + b) * (c - d", + "func(", + "func(x, y + 2", + "a * (b + c) - d /", + "f(g(x), h(y, z)", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/", + } + ); + + // Test case for a more complex grammar with tokens + test_grammar( + "complex grammar with tokens", + R"""( + root ::= reasoning+ content tool-call* + reasoning ::= <[10]> (!<[11]>)* <[11]> + content ::= <[20]> (!<[21]>)* <[21]> + tool-call ::= <[12]> name <[13]> args <[14]> + name ::= (!<[13]>)+ + args ::= (!<[14]>)*)""", + // Passing strings + { + token(10) + "I am thinking" + token(11) + token(20) + "hello world!" + token(21) + token(12) + "search" + token(13) + "query=test" + token(14), + token(10) + "reasoning 1" + token(11) + token(10) + "reasoning 2" + token(11) + token(20) + token(21) + token(12) + "tool" + token(13) + token(14), + token(10) + token(11) + token(20) + "content" + token(21), + token(10) + "think" + token(12) + " nested" + token(11) + token(20) + token(10) + "more content" + token(21) + token(12) + "fn" + token(13) + "x=1,y=2" + token(14) + token(12) + "fn2" + token(13) + token(14), + token(10) + "reasoning" + token(11) + token(10) + "more" + token(11) + token(10) + "even more" + token(11) + token(20) + "text" + token(21) + token(12) + "a" + token(13) + "b" + token(14) + token(12) + "c" + token(13) + "d" + token(14), + }, + // Failing strings + { + token(20) + "content only" + token(21), + token(10) + "no closing reasoning", + token(10) + token(11) + token(20) + "no closing content", + token(10) + token(11) + token(20) + token(21) + token(12) + "incomplete tool", + token(10) + token(11) + token(11) + token(20) + token(21), + } + ); +} + +static void test_special_chars() { + // A collection of tests to exercise special characters such as "." 
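+    // Illustrative note: in GBNF, "." matches any single character (one Unicode codepoint,
+    // not one byte), so the grammar below requires exactly three arbitrary characters, then
+    // "abc", then three more -- hence "aaaabcccc" passes, and so does "🔵🟠✅abc❌🟠🔵",
+    // because each emoji counts as a single character.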
+ test_grammar( + "special characters", + // Grammar + R"""( + root ::= ... "abc" ... + )""", + // Passing strings + { + "abcabcabc", + "aaaabcccc", + // NOTE: Also ensures that multi-byte characters still count as a single character + "🔵🟠✅abc❌🟠🔵" + }, + // Failing strings + { + "aaabcccc", + "aaaaabcccc", + "aaaabccc", + "aaaabccccc", + "🔵🟠✅❌abc❌✅🟠🔵", + "🔵🟠abc🟠🔵" + } + ); +} + +static void test_quantifiers() { + // A collection of tests to exercise * + and ? quantifiers + + test_grammar( + "* quantifier", + // Grammar + R"""(root ::= "a"*)""", + // Passing strings + { + "", + "a", + "aaaaa", + "aaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + }, + // Failing strings + { + "b", + "ab", + "aab", + "ba", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" + } + ); + test_grammar( + "+ quantifier", + // Grammar + R"""(root ::= "a"+)""", + // Passing strings + { + "a", + "aaaaa", + "aaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + }, + // Failing strings + { + "", + "b", + "ab", + "aab", + "ba", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" + } + ); + test_grammar( + "? quantifier", + // Grammar + R"""(root ::= "a"?)""", + // Passing strings + { + "", + "a" + }, + // Failing strings + { + "b", + "ab", + "aa", + "ba", + } + ); + test_grammar( + "mixed quantifiers", + // Grammar + R"""( + root ::= cons+ vowel* cons? (vowel cons)* + vowel ::= [aeiouy] + cons ::= [bcdfghjklmnpqrstvwxyz] + )""", + // Passing strings + { + "yes", + "no", + "noyes", + "crwth", + "four", + "bryyyy", + }, + // Failing strings + { + "yess", + "yesno", + "forty", + "catyyy", + } + ); + test_grammar( + "simple exact repetition", + // Grammar + R"""( + root ::= [ab]{4} + )""", + // Passing strings + { + "aaaa", + "bbbb", + "abab", + }, + // Failing strings + { + "a", + "b", + "aaaaa", + } + ); + test_grammar( + "simple min repetition", + // Grammar + R"""( + root ::= [ab]{4,} + )""", + // Passing strings + { + "aaaa", + "aaaaab", + "bbbb", + "ababab", + }, + // Failing strings + { + "", + "aba", + } + ); + test_grammar( + "simple max repetition", + // Grammar + R"""( + root ::= [ab]{0,4} + )""", + // Passing strings + { + "", + "a", + "aa", + "aaa", + "aaab", + }, + // Failing strings + { + "aaaaa", + } + ); + test_grammar( + "min / max repetition", + // Grammar + R"""( + root ::= ("0x" [A-F0-9]{2} " "?){3,5} + )""", + // Passing strings + { + "0xFF 0x12 0xAB", + "0xFF 0x12 0xAB 0x00 0x00", + }, + // Failing strings + { + "", + "0xFF", + "0xFF 0x12", + "0xFF 0x12 0xAB 0x00 0x00 0x00", + } + ); +} + +static void test_failure_missing_root() { + fprintf(stderr, "⚫ Testing missing root node:\n"); + // Test case for a grammar that is missing a root rule + const std::string grammar_str = R"""( + rot ::= expr + expr ::= term ("+" term)* + term ::= number + number ::= [0-9]+)"""; + + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we do NOT have a root node + assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()); + fprintf(stderr, " ✅︎ Passed\n"); +} + +static void test_failure_missing_reference() { + fprintf(stderr, "⚫ Testing missing reference node:\n"); + + // Test case for a grammar that is missing a referenced rule + const std::string grammar_str = + R"""(root ::= expr + expr ::= term ("+" term)* + term ::= numero + number ::= [0-9]+)"""; + + fprintf(stderr, " 
Expected error: "); + + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_str.c_str()); + + // Ensure we did NOT parsed correctly + assert(parsed_grammar.rules.empty()); + + fprintf(stderr, " End of expected error.\n"); + fprintf(stderr, " ✅︎ Passed\n"); +} + +static void test_failure_left_recursion() { + fprintf(stderr, "⚫ Testing left recursion detection:\n"); + + // Test simple left recursion detection + const std::string simple_str = R"""(root ::= "a" | root "a")"""; + assert(test_build_grammar_fails(simple_str)); + + // Test more complicated left recursion detection + const std::string medium_str = R"""( + root ::= asdf + asdf ::= "a" | asdf "a" + )"""; + assert(test_build_grammar_fails(medium_str)); + + // Test even more complicated left recursion detection + const std::string hard_str = R"""( + root ::= asdf + asdf ::= "a" | foo "b" + foo ::= "c" | asdf "d" | "e")"""; + assert(test_build_grammar_fails(hard_str)); + + // Test yet even more complicated left recursion detection + const std::string hardest_str = R"""( + root ::= asdf + asdf ::= "a" | foo "b" + foo ::= "c" | empty asdf "d" | "e" + empty ::= "blah" | )"""; + assert(test_build_grammar_fails(hardest_str)); + + fprintf(stderr, " ✅︎ Passed\n"); +} + +static void test_json_schema() { + // Note that this is similar to the regular grammar tests, + // but we convert each json schema to a grammar before parsing. + // Otherwise, this test structure is the same. + + test_schema( + "empty schema (object)", + // Schema + R"""( + {} + )""", + // Passing strings + { + R"""({})""", + R"""({"foo": "bar"})""", + }, + // Failing strings + { + "", + "[]", + "null", + R"""("")""", + "true", + } + ); + + test_schema( + "exotic formats (list)", + // Schema + R"""({ + "items": [ + { "format": "date" }, + { "format": "uuid" }, + { "format": "time" }, + { "format": "date-time" } + ] + })""", + // Passing strings + { + // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""", + //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? 
+ }, + // Failing strings + { + R"""(["foo", "bar"])""", + R"""(["12345678-1234-1234-1234-1234567890ab"])""", + } + ); + + test_schema( + "string", + // Schema + R"""({ + "type": "string" + })""", + // Passing strings + { + R"""("foo")""", + R"""("bar")""", + R"""("")""", + }, + // Failing strings + { + R"""({})""", + R"""("foo": "bar")""", + } + ); + + test_schema( + "string w/ min length 1", + // Schema + R"""({ + "type": "string", + "minLength": 1 + })""", + // Passing strings + { + R"""("foo")""", + R"""("bar")""", + }, + // Failing strings + { + R"""("")""", + R"""({})""", + R"""("foo": "bar")""", + } + ); + + test_schema( + "string w/ min length 3", + // Schema + R"""({ + "type": "string", + "minLength": 3 + })""", + // Passing strings + { + R"""("foo")""", + R"""("bar")""", + R"""("foobar")""", + }, + // Failing strings + { + R"""("")""", + R"""("f")""", + R"""("fo")""", + } + ); + + test_schema( + "string w/ max length", + // Schema + R"""({ + "type": "string", + "maxLength": 3 + })""", + // Passing strings + { + R"""("foo")""", + R"""("bar")""", + R"""("")""", + R"""("f")""", + R"""("fo")""", + }, + // Failing strings + { + R"""("foobar")""", + } + ); + + test_schema( + "string w/ min & max length", + // Schema + R"""({ + "type": "string", + "minLength": 1, + "maxLength": 4 + })""", + // Passing strings + { + R"""("foo")""", + R"""("bar")""", + R"""("f")""", + R"""("barf")""", + }, + // Failing strings + { + R"""("")""", + R"""("barfo")""", + R"""("foobar")""", + } + ); + + test_schema( + "boolean", + // Schema + R"""({ + "type": "boolean" + })""", + // Passing strings + { + "true", + "false", + }, + // Failing strings + { + R"""("")""", + R"""("true")""", + R"""(True)""", + R"""(FALSE)""", + } + ); + + test_schema( + "integer", + // Schema + R"""({ + "type": "integer" + })""", + // Passing strings + { + R"""(0)""", + R"""(12345)""", + R"""(1234567890123456)""", + }, + // Failing strings + { + R"""()""", + R"""(01)""", + R"""(007)""", + R"""(12345678901234567 )""", + } + ); + + test_schema( + "string const", + // Schema + R"""({ + "const": "foo" + })""", + // Passing strings + { + R"""("foo")""", + }, + // Failing strings + { + R"""(foo)""", + R"""("bar")""", + } + ); + + test_schema( + "non-string const", + // Schema + R"""({ + "const": true + })""", + // Passing strings + { + R"""(true)""", + }, + // Failing strings + { + R"""()""", + R"""(foo)""", + R"""("true")""", + } + ); + + test_schema( + "non-string const", + // Schema + R"""({ + "enum": ["red", "amber", "green", null, 42, ["foo"]] + })""", + // Passing strings + { + R"""("red")""", + R"""(null)""", + R"""(42)""", + R"""(["foo"])""", + }, + // Failing strings + { + R"""()""", + R"""(420)""", + R"""(true)""", + R"""(foo)""", + } + ); + + test_schema( + "simple pattern", + // Schema + R"""({ + "pattern": "^[a-zA-Z0-9_-]*$" + })""", + // Passing strings + { + R"""("")""", + R"""("He_llo-12")""", + }, + // Failing strings + { + R"""("!")""", + R"""("Hello World")""", + } + ); + + test_schema( + "pattern with escapes", + // Schema + R"""({ + "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$" + })""", + // Passing strings + { + R"""("a^$.[]()|{}*+?b")""", + }, + // Failing strings + { + R"""("ab")""", + } + ); + + test_schema( + "", + // Schema + R"""( + { + "type": ["array", "null"], + "items": { "type": "string" } + } + )""", + // Passing strings + { + "null", + "[]", + "[\"123\"]", + "[\"foo\", \"bar\"]", + }, + // Failing strings + { + "", + "[123]", + "\"foo\"", + "[\"foo\", 42]", + } + ); + + test_schema( + "min+max 
items", + // Schema + R"""({ + "items": { + "type": ["number", "integer"] + }, + "minItems": 3, + "maxItems": 5 + })""", + // Passing strings + { + R"""([1, 2, 3])""", + R"""([1, 2, 3, 4])""", + R"""([1, 2, 3, 4, 5])""", + }, + // Failing strings + { + R"""([1, 2])""", + R"""([1, 2, 3, 4, 5, 6])""", + R"""(1)""", + } + ); + + // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) + test_schema( + "object properties", + // Schema + R"""({ + "type": "object", + "properties": { + "number": { "type": "number" }, + "street_name": { "type": "string" }, + "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } + } + })""", + // Passing strings + { + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // "By default, leaving out properties is valid" + R"""({ "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_name": "Pennsylvania" })""", + // "By extension, even an empty object is valid" + R"""({})""", + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", + }, + // Failing strings + { + // Change datatype from number to string + R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Reorder properties + R"""({ "street_name": "Pennsylvania", "number": 1600 })""", + // Reorder properties + R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // "Additional properties default to false for generation, even though the spec says true. + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", + + } + ); + + test_schema( + "additional properties can't override other properties", + R"""({ + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"} + }, + "additionalProperties": true + })""", + // Passing strings + { + R"""({"a": 42})""", + R"""({"c": ""})""", + R"""({"a": 42, "c": ""})""", + R"""({"a_": ""})""", + }, + // Failing strings + { + R"""()""", + R"""({"a": ""})""", + R"""({"a": "", "b": ""})""", + } + ); + + // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) + test_schema( + "object properties, additionalProperties: true", + // Schema + R"""({ + "type": "object", + "properties": { + "number": { "type": "number" }, + "street_name": { "type": "string" }, + "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } + }, + "additionalProperties": true + })""", + // Passing strings + { + // "By extension, even an empty object is valid" + R"""({})""", + R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""", + // "By default, leaving out properties is valid" + R"""({ "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_name": "Pennsylvania" })""", + // "By default, providing additional properties is valid" + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", + }, + // Failing strings + { + // Change datatype from number to string + R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Reorder properties + R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""", + } + ); + + // Additional properties: false + test_schema( + "required + optional props each in original order", + // Schema + R"""({ + "type": "object", + "properties": { + "number": { "type": "number" 
}, + "street_name": { "type": "string" }, + "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } + }, + "additionalProperties": false + })""", + // Passing strings + { + R"""({ "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_type":"Avenue"})""", + R"""({ "number": 1600, "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Spaces are permitted around enum values + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", + }, + // Failing strings + { + // Reorder properties + R"""({ "street_type": "Avenue", "number": 1600 })""", + // Add "direction" + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""", + } + ); + + test_schema( + "required + optional props each in original order", + // Schema + R"""({ + "properties": { + "b": {"type": "string"}, + "a": {"type": "string"}, + "d": {"type": "string"}, + "c": {"type": "string"} + }, + "required": ["a", "b"], + "additionalProperties": false + })""", + // Passing strings + { + R"""({"b": "foo", "a": "bar"})""", + R"""({"b":"foo","a":"bar","d":"qux"})""", + R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""", + }, + // Failing strings + { + R"""({"a": "foo", "b": "bar"})""", + R"""({"b": "bar"})""", + R"""({"a": "foo", "c": "baz"})""", + R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""", + } + ); + + // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties + test_schema( + "required props", + // Schema + R"""({ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/product.schema.json", + "title": "Product", + "description": "A product from Acme's catalog", + "type": "object", + "properties": { + "productId": { + "description": "The unique identifier for a product", + "type": "integer" + }, + "productName": { + "description": "Name of the product", + "type": "string" + }, + "price": { + "description": "The price of the product", + "type": "number", + "exclusiveMinimum": 0 + }, + "tags": { + "description": "Tags for the product", + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": true + }, + "dimensions": { + "type": "object", + "properties": { + "length": { + "type": "number" + }, + "width": { + "type": "number" + }, + "height": { + "type": "number" + } + }, + "required": [ "length", "width", "height" ] + } + }, + "required": [ "productId", "productName", "price" ] + })""", + // Passing strings + { + R"""({"productId": 1, "productName": "A green door", "price": 12.50})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""", + }, + // Failing strings + { + R"""({})""", // Missing all required properties + R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties + // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement. + // Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex. 
+ // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""", + R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price) + R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId) + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1 + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order + // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement. + // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""", + } + ); +} + +int main() { + fprintf(stdout, "Running grammar integration tests...\n"); + test_simple_grammar(); + test_complex_grammar(); + test_special_chars(); + test_quantifiers(); + test_failure_missing_root(); + test_failure_missing_reference(); + test_failure_left_recursion(); + test_json_schema(); + fprintf(stdout, "All tests passed.\n"); + return 0; +} diff --git a/llama.cpp/tests/test-grammar-llguidance.cpp b/llama.cpp/tests/test-grammar-llguidance.cpp new file mode 100644 index 0000000..34746c2 --- /dev/null +++ b/llama.cpp/tests/test-grammar-llguidance.cpp @@ -0,0 +1,1204 @@ +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include "sampling.h" + +#include +#include +#include + +static const llama_vocab * vocab; + +static bool match_string(const std::string & input, llama_sampler * grammar) { + llama_sampler_reset(grammar); + auto tokens = common_tokenize(vocab, input, false, false); + + auto n_vocab = llama_vocab_n_tokens(vocab); + + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) { + cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f }); + } + auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false }; + + for (const auto token : tokens) { + for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) { + cur[token_id].logit = 0.0f; + } + llama_sampler_apply(grammar, &tok_arr); + if (cur[token].logit < 0.0f) { + return false; + } + llama_sampler_accept(grammar, token); + } + + // do we allow EOS at the end? if so the grammar is accepting + + auto tok_eos = llama_vocab_eot(vocab); + if (tok_eos == LLAMA_TOKEN_NULL) { + tok_eos = llama_vocab_eos(vocab); + } + + cur[tok_eos].logit = 0.0f; + llama_sampler_apply(grammar, &tok_arr); + + return cur[tok_eos].logit >= 0.0f; +} + +static void test(const std::string & test_desc, const std::string & grammar_str, + const std::vector & passing_strings, const std::vector & failing_strings) { + fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str()); + fflush(stderr); + + auto * grammar = llama_sampler_init_llg(vocab, "lark", grammar_str.c_str()); + + fprintf(stderr, " 🔵 Valid strings:\n"); + + // Passing strings + for (const auto & test_string : passing_strings) { + fprintf(stderr, " \"%s\" ", test_string.c_str()); + fflush(stderr); + + bool matched = match_string(test_string, grammar); + + if (!matched) { + fprintf(stderr, "❌ (failed to match)\n"); + + // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed. 
+ // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf + FILE * grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w"); + if (grammar_file) { + fprintf(grammar_file, "%s", grammar_str.c_str()); + fclose(grammar_file); + } + + // DEBUG: Write the test string to test-grammar-integration.string.txt + FILE * string_file = fopen("test-grammar-integration.string.txt", "w"); + if (string_file) { + fprintf(string_file, "%s", test_string.c_str()); + fclose(string_file); + } + + fprintf(stderr, + "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following " + "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf " + "test-grammar-integration.string.txt\n\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + + assert(matched); + } + + fprintf(stderr, " 🟠 Invalid strings:\n"); + + // Failing strings + for (const auto & test_string : failing_strings) { + fprintf(stderr, " \"%s\" ", test_string.c_str()); + fflush(stderr); + + bool matched = match_string(test_string, grammar); + + if (matched) { + fprintf(stderr, "❌ (incorrectly matched)\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + assert(!matched); + } + + llama_sampler_free(grammar); +} + +static void test_grammar(const std::string & test_desc, const std::string & grammar_str, + const std::vector & passing_strings, + const std::vector & failing_strings) { + test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings); +} + +static void test_schema(const std::string & test_desc, const std::string & schema_str, + const std::vector & passing_strings, + const std::vector & failing_strings) { + test(test_desc + ". Schema: " + schema_str, "%llguidance {}\nstart: %json " + schema_str, passing_strings, + failing_strings); +} + +static void test_simple_grammar() { + test_schema("min 0", + R"""({ + "type": "integer", + "minimum": 0 + })""", + // Passing strings + { + "0", + "10", + "12", + "10000", + }, + // Failing strings + { + "-1", + "-10", + "-10000", + "-100000000000000000000000000000000", + // "100000000000000000000000000000000", + "00", + "01", + "-0", + }); + test_schema("min 2", + // Schema + R"""({ + "type": "integer", + "minimum": 2 + })""", + // Passing strings + { + "2", + "3", + "4", + "10", + "20", + "1234567890000000", + }, + // Failing strings + { + "0", "1", "-1", "-100", "0", "1", "01", "02", + // "12345678900000000", + }); + test_schema("min 456", + R"""({ + "type": "integer", + "minimum": 456 + })""", + // Passing strings + { + "456", + "4560", + "457", + "460", + "500", + }, + // Failing strings + { + "455", + "356", + "50", + "050", + "-1", + "-456", + }); + test_schema("min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "123", + "1234", + "2345", + }, + // Failing strings + { + "-1234", + "-124", + }); + + test_schema("max 9999", + // Schema + R"""({ + "type": "integer", + "maximum": 9999 + })""", + // Passing strings + { + "-99999", + "0", + "9999", + }, + // Failing strings + { + "10000", + "99991", + }); + test_schema("max -9999", + // Schema + R"""({ + "type": "integer", + "maximum": -9999 + })""", + // Passing strings + { + "-10000", + "-9999", + }, + // Failing strings + { + "-9998", + "0", + "9999", + }); + test_schema("min 5 max 30", + // Schema + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + // Passing strings + { + "5", + "10", + "30", + }, + // Failing strings + { + "05", + "4", + "-1", + "31", + 
"123", + "0123", + }); + test_schema("min -1 max 1", + R"""({ + "type": "integer", + "minimum": -1, + "maximum": 1 + })""", + // Passing strings + { + "-1", + "0", + "1", + }, + // Failing strings + { + "-11", + "-10", + "-2", + "2", + "10", + "11", + }); + test_schema("min -123 max 42", + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + // Passing strings + { + "-123", + "-122", + "-13", + "-11", + "-2", + "-1", + "0", + "1", + "5", + "10", + "39", + "40", + "42", + }, + // Failing strings + { + "-0123", + "-124", + "-1123", + "-200", + "43", + "123", + "0123", + }); + test_schema("exclusive min / max", + // Schema + R"""({ + "type": "integer", + "exclusiveMinimum": 0, + "exclusiveMaximum": 10000 + })""", + // Passing strings + { + "1", + "9999", + }, + // Failing strings + { + "0", + "01", + "10000", + "99999", + }); + + // Test case for a simple grammar + test_grammar("simple grammar", + R"""( + start: expr + expr: term ("+" term)* + term: number + number: /[0-9]+/ )""", + // Passing strings + { + "42", + "1+2+3+4+5", + "123+456", + }, + // Failing strings + { + "+", + "/ 3", + "1+2+3+4+5+", + "12a45", + }); +} + +static void test_complex_grammar() { + // Test case for a more complex grammar, with both failure strings and success strings + test_grammar("medium complexity grammar", + // Grammar + R"""( + start: expression + expression: term ws (("+"|"-") ws term)* + term: factor ws (("*"|"/") ws factor)* + factor: number | variable | "(" expression ")" | function-call + number: /[0-9]+/ + variable: /[a-zA-Z_][a-zA-Z0-9_]*/ + function-call: variable ws "(" (expression ("," ws expression)*)? ")" + ws: /[ \t\n\r]?/ )""", + // Passing strings + { "42", + "1*2*3*4*5", + "x", + "x+10", + "x1+y2", + "(a+b)*(c-d)", + "func()", + "func(x,y+2)", + "a*(b+c)-d/e", + "f(g(x),h(y,z))", + "x + 10", + "x1 + y2", + "(a + b) * (c - d)", + "func()", + "func(x, y + 2)", + "a * (b + c) - d / e", + "f(g(x), h(y, z))", + "123+456", + "123*456*789-123/456+789*123", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" }, + // Failing strings + { + "+", + "/ 3x", + "x + + y", + "a * / b", + "func(,)", + "func(x y)", + "(a + b", + "x + y)", + "a + b * (c - d", + "42 +", + "x +", + "x + 10 +", + "(a + b) * (c - d", + "func(", + "func(x, y + 2", + "a * (b + c) - d /", + "f(g(x), h(y, z)", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/", + }); +} + +static void test_special_chars() { + // A collection of tests to exercise special characters such as "." + test_grammar("special characters", + // Grammar + R"""( + start: /.../ "abc" /.../ + )""", + // Passing strings + { "abcabcabc", "aaaabcccc", + // NOTE: Also ensures that multi-byte characters still count as a single character + "🔵🟠✅abc❌🟠🔵" }, + // Failing strings + { "aaabcccc", "aaaaabcccc", "aaaabccc", "aaaabccccc", "🔵🟠✅❌abc❌✅🟠🔵", "🔵🟠abc🟠🔵" }); +} + +static void test_quantifiers() { + // A collection of tests to exercise * + and ? 
quantifiers + + test_grammar( + "* quantifier", + // Grammar + R"""(start: "a"*)""", + // Passing strings + { "", "a", "aaaaa", "aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, + // Failing strings + { "b", "ab", "aab", "ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" }); + test_grammar( + "+ quantifier", + // Grammar + R"""(start: "a"+)""", + // Passing strings + { "a", "aaaaa", "aaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, + // Failing strings + { "", "b", "ab", "aab", "ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" }); + test_grammar("? quantifier", + // Grammar + R"""(start: "a"?)""", + // Passing strings + { "", "a" }, + // Failing strings + { + "b", + "ab", + "aa", + "ba", + }); + test_grammar("mixed quantifiers", + // Grammar + R"""( + start: cons+ vowel* cons? (vowel cons)* + vowel: /[aeiouy]/ + cons: /[bcdfghjklmnpqrstvwxyz]/ + )""", + // Passing strings + { + "yes", + "no", + "noyes", + "crwth", + "four", + "bryyyy", + }, + // Failing strings + { + "yess", + "yesno", + "forty", + "catyyy", + }); + test_grammar("simple exact repetition", + // Grammar + R"""( + start: /[ab]{4}/ + )""", + // Passing strings + { + "aaaa", + "bbbb", + "abab", + }, + // Failing strings + { + "a", + "b", + "aaaaa", + }); + test_grammar("simple min repetition", + // Grammar + R"""( + start: /[ab]{4,}/ + )""", + // Passing strings + { + "aaaa", + "aaaaab", + "bbbb", + "ababab", + }, + // Failing strings + { + "", + "aba", + }); + test_grammar("simple max repetition", + // Grammar + R"""( + start: /[ab]{0,4}/ + )""", + // Passing strings + { + "", + "a", + "aa", + "aaa", + "aaab", + }, + // Failing strings + { + "aaaaa", + }); + // test_grammar("min / max repetition", + // // Grammar + // R"""( + // start: ("0x" /[A-F0-9]{2}/ " "?){3,5} + // )""", + // // Passing strings + // { + // "0xFF 0x12 0xAB", + // "0xFF 0x12 0xAB 0x00 0x00", + // }, + // // Failing strings + // { + // "", + // "0xFF", + // "0xFF 0x12", + // "0xFF 0x12 0xAB 0x00 0x00 0x00", + // }); +} + +static void test_json_schema() { + // Note that this is similar to the regular grammar tests, + // but we convert each json schema to a grammar before parsing. + // Otherwise, this test structure is the same. + + test_schema("empty schema (object)", + // Schema + R"""( + {"type":"object"} + )""", + // Passing strings + { + R"""({})""", + R"""({"foo": "bar"})""", + }, + // Failing strings + { + "", + "[]", + "null", + R"""("")""", + "true", + }); + + test_schema( + "exotic formats (list)", + // Schema + R"""({ + "items": [ + { "format": "date" }, + { "format": "uuid" }, + { "format": "time" }, + { "format": "date-time" } + ] + })""", + // Passing strings + { + // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""", + //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? + //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? 
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        });
+
+    test_schema("string",
+        // Schema
+        R"""({
+            "type": "string"
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
+        },
+        // Failing strings
+        {
+            R"""({})""",
+            R"""("foo": "bar")""",
+        });
+
+    test_schema("string w/ min length 1",
+        // Schema
+        R"""({
+            "type": "string",
+            "minLength": 1
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""({})""",
+            R"""("foo": "bar")""",
+        });
+
+    test_schema("string w/ min length 3",
+        // Schema
+        R"""({
+            "type": "string",
+            "minLength": 3
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("foobar")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
+        });
+
+    test_schema("string w/ max length",
+        // Schema
+        R"""({
+            "type": "string",
+            "maxLength": 3
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("")""",
+            R"""("f")""",
+            R"""("fo")""",
+        },
+        // Failing strings
+        {
+            R"""("foobar")""",
+        });
+
+    test_schema("string w/ min & max length",
+        // Schema
+        R"""({
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 4
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+            R"""("bar")""",
+            R"""("f")""",
+            R"""("barf")""",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("barfo")""",
+            R"""("foobar")""",
+        });
+
+    test_schema("boolean",
+        // Schema
+        R"""({
+            "type": "boolean"
+        })""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            R"""("")""",
+            R"""("true")""",
+            R"""(True)""",
+            R"""(FALSE)""",
+        });
+
+    test_schema("integer",
+        // Schema
+        R"""({
+            "type": "integer"
+        })""",
+        // Passing strings
+        {
+            R"""(0)""",
+            R"""(12345)""",
+            R"""(1234567890123456)""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(01)""",
+            R"""(007)""",
+            R"""(12345678901234567 )""",
+        });
+
+    test_schema("string const",
+        // Schema
+        R"""({
+            "const": "foo"
+        })""",
+        // Passing strings
+        {
+            R"""("foo")""",
+        },
+        // Failing strings
+        {
+            R"""(foo)""",
+            R"""("bar")""",
+        });
+
+    test_schema("non-string const",
+        // Schema
+        R"""({
+            "const": true
+        })""",
+        // Passing strings
+        {
+            R"""(true)""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(foo)""",
+            R"""("true")""",
+        });
+
+    test_schema("non-string enum",
+        // Schema
+        R"""({
+            "enum": ["red", "amber", "green", null, 42, ["foo"]]
+        })""",
+        // Passing strings
+        {
+            R"""("red")""",
+            R"""(null)""",
+            R"""(42)""",
+            R"""(["foo"])""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""(420)""",
+            R"""(true)""",
+            R"""(foo)""",
+        });
+
+    test_schema("simple pattern",
+        // Schema
+        R"""({
+            "pattern": "^[a-zA-Z0-9_-]*$"
+        })""",
+        // Passing strings
+        {
+            R"""("")""",
+            R"""("He_llo-12")""",
+        },
+        // Failing strings
+        {
+            R"""("!")""",
+            R"""("Hello World")""",
+        });
+
+    test_schema("pattern with escapes",
+        // Schema
+        R"""({
+            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+        })""",
+        // Passing strings
+        {
+            R"""("a^$.[]()|{}*+?b")""",
+        },
+        // Failing strings
+        {
+            R"""("ab")""",
+        });
+
+    test_schema("",
+        // Schema
+        R"""(
+            {
+                "type": ["array", "null"],
+                "items": { "type": "string" }
+            }
+        )""",
+        // Passing strings
+        {
+            "null",
+            "[]",
+            "[\"123\"]",
+            "[\"foo\", \"bar\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "[123]",
+            "\"foo\"",
+            "[\"foo\", 42]",
+        });
+
+    test_schema("min+max items",
+        // Schema
+        R"""({
+            "items": {
+                "type": ["number", "integer"]
+            },
+            "minItems": 3,
+            "maxItems": 5
+        })""",
+        // Passing strings
+        {
+            R"""([1, 2, 3])""",
+            R"""([1, 2, 3, 4])""",
+            R"""([1, 2, 3, 4, 5])""",
+            // this is in fact correct; keywords do not apply if the type is wrong
+            R"""(1)""",
+        },
+        // Failing strings
+        {
+            R"""([1, 2])""",
+            R"""([1, 2, 3, 4, 5, 6])""",
+        });
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema("object properties",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": false
+        })""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Additional properties set to false
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+
+        });
+
+    test_schema("additional properties can't override other properties",
+        R"""({
+            "properties": {
+                "a": {"type": "integer"},
+                "b": {"type": "integer"}
+            },
+            "additionalProperties": true
+        })""",
+        // Passing strings
+        {
+            R"""({"a": 42})""",
+            R"""({"c": ""})""",
+            R"""({"a": 42, "c": ""})""",
+            R"""({"a_": ""})""",
+        },
+        // Failing strings
+        {
+            R"""()""",
+            R"""({"a": ""})""",
+            R"""({"a": "", "b": ""})""",
+        });
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema("object properties, additionalProperties: true",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+            },
+            "additionalProperties": true
+        })""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+        });
+
+    // Additional properties: false
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""({
+            "type": "object",
+            "properties": {
+                "number": { "type": "number" },
+                "street_name": { "type": "string" },
+
"street_type": { "enum": ["Street", "Avenue", "Boulevard"] } + }, + "additionalProperties": false + })""", + // Passing strings + { + R"""({ "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_type":"Avenue"})""", + R"""({ "number": 1600, "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Spaces are permitted around enum values + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", + }, + // Failing strings + { + // Reorder properties + R"""({ "street_type": "Avenue", "number": 1600 })""", + // Add "direction" + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""", + }); + + test_schema("required + optional props each in original order", + // Schema + R"""({ + "properties": { + "b": {"type": "string"}, + "a": {"type": "string"}, + "d": {"type": "string"}, + "c": {"type": "string"} + }, + "required": ["a", "b"], + "additionalProperties": false + })""", + // Passing strings + { + R"""({"b": "foo", "a": "bar"})""", + R"""({"b":"foo","a":"bar","d":"qux"})""", + R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""", + }, + // Failing strings + { + R"""({"a": "foo", "b": "bar"})""", + R"""({"b": "bar"})""", + R"""({"a": "foo", "c": "baz"})""", + R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""", + }); + + // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties + test_schema( + "required props", + // Schema + R"""({ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/product.schema.json", + "title": "Product", + "description": "A product from Acme's catalog", + "type": "object", + "properties": { + "productId": { + "description": "The unique identifier for a product", + "type": "integer" + }, + "productName": { + "description": "Name of the product", + "type": "string" + }, + "price": { + "description": "The price of the product", + "type": "number", + "exclusiveMinimum": 0 + }, + "tags": { + "description": "Tags for the product", + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "DISABLED_uniqueItems": true + }, + "dimensions": { + "type": "object", + "properties": { + "length": { + "type": "number" + }, + "width": { + "type": "number" + }, + "height": { + "type": "number" + } + }, + "required": [ "length", "width", "height" ] + } + }, + "required": [ "productId", "productName", "price" ] + })""", + // Passing strings + { + R"""({"productId": 1, "productName": "A green door", "price": 12.50})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""", + }, + // Failing strings + { + R"""({})""", // Missing all required properties + R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties + // `exclusiveMinimum` is OK for llg + R"""({"productId": 1, "productName": "A green door", "price": -12.50})""", + R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price) + R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId) + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1 + R"""({"productId": 1, 
"productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order + // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement. + // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""", + }); +} + +static void one_hot(llama_token_data_array & tok_arr, llama_token selected) { + auto n_vocab = tok_arr.size; + + tok_arr.selected = -1; + tok_arr.sorted = false; + for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) { + tok_arr.data[token_id].id = token_id; + tok_arr.data[token_id].logit = 0.0f; + } + + tok_arr.data[selected].logit = 100.0f; +} + +static void test_sampler_chain(void) { + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = false; + llama_sampler * sampler = llama_sampler_chain_init(sparams); + + const auto grammar_data = R"(%llguidance {} +start: /[A-Z ]*/)"; + + llama_sampler_chain_add(sampler, llama_sampler_init_llg(vocab, "lark", grammar_data)); + llama_sampler_chain_add(sampler, llama_sampler_init_dist(42)); + + auto input = "ALL YOUR BASE ARE BELONG TO US"; + auto tokens = common_tokenize(vocab, input, false, false); + + auto n_vocab = llama_vocab_n_tokens(vocab); + + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) { + cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f }); + } + auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false }; + + for (const auto token : tokens) { + one_hot(tok_arr, token); + + fprintf(stderr, "applying token: %d\n", token); + llama_sampler_apply(sampler, &tok_arr); + + auto idx = tok_arr.selected; + fprintf(stderr, " -> %d %f\n", cur[idx].id, cur[idx].logit); + assert(cur[tok_arr.selected].id == token); + llama_sampler_accept(sampler, token); + } + + auto tok_eos = llama_vocab_eot(vocab); + if (tok_eos == LLAMA_TOKEN_NULL) { + tok_eos = llama_vocab_eos(vocab); + } + + one_hot(tok_arr, tok_eos); + + llama_sampler_apply(sampler, &tok_arr); + assert(cur[tok_arr.selected].id == tok_eos); +} + +int main(int argc, const char ** argv) { + fprintf(stdout, "Running llguidance integration tests...\n"); + + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char * vocab_file = argv[1]; + + fprintf(stderr, "reading vocab from: '%s'\n", vocab_file); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_model_load_from_file(vocab_file, mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, vocab_file); + return 1; + } + + // needed? 
+ auto cparams = llama_context_default_params(); + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, vocab_file); + llama_model_free(model); + return 1; + } + } + + vocab = llama_model_get_vocab(model); + + test_simple_grammar(); + test_complex_grammar(); + test_special_chars(); + test_quantifiers(); + test_json_schema(); + + test_sampler_chain(); + + llama_free(ctx); + llama_model_free(model); + + fprintf(stdout, "All tests passed.\n"); + return 0; +} diff --git a/llama.cpp/tests/test-grammar-parser.cpp b/llama.cpp/tests/test-grammar-parser.cpp new file mode 100644 index 0000000..03ae78f --- /dev/null +++ b/llama.cpp/tests/test-grammar-parser.cpp @@ -0,0 +1,533 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include "llama.h" + +// TODO: shold not include libllama sources +#include "../src/llama-grammar.h" + +#include + +static const char * type_str(llama_gretype type) { + switch (type) { + case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR"; + case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT"; + case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT"; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER"; + case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF"; + case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT"; + case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END"; + default: return "?"; + } +} + +static void verify_parsing(const char *grammar_bytes, const std::vector> expected, const std::vector &expected_rules) { + uint32_t index = 0; + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_bytes); + + std::map symbol_names; + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { + symbol_names[it->second] = it->first; + } + + auto print_all = [&]() { + fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes); + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { + fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second); + } + fprintf(stderr, " }, {\n"); + for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) { + fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule); + auto & rule = parsed_grammar.rules[i_rule]; + for (uint32_t i = 0; i < rule.size(); i++) { + std::string rule_str; + fprintf(stderr, " {%s, ", type_str(rule[i].type)); + if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT || + rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { + char c = rule[i].value; + if (c == '\n') { + fprintf(stderr, "'\\n'"); + } else if (c == '\t') { + fprintf(stderr, "'\\t'"); + } else if (c == '\r') { + fprintf(stderr, "'\\r'"); + } else if (c == '\0') { + fprintf(stderr, "'\\0'"); + } else { + fprintf(stderr, "'%c'", c); + } + } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) { + fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value); + } else { + fprintf(stderr, "%u", rule[i].value); + } + fprintf(stderr, "},\n"); + } + } + fprintf(stderr, " });\n"); + }; + + if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) { + print_all(); + fprintf(stderr, "\n"); + return; + } + + fprintf(stderr, "Testing grammar:%s\n", grammar_bytes); + + if (parsed_grammar.symbol_ids.size() != expected.size()) { + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); + 
assert(parsed_grammar.symbol_ids.size() == expected.size()); + } + + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) + { + std::string key = it->first; + uint32_t value = it->second; + std::pair expected_pair = expected[index]; + + // pretty print error message before asserting + if (expected_pair.first != key || expected_pair.second != value) + { + fprintf(stderr, "index: %u\n", index); + fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); + fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); + fprintf(stderr, "expected_pair != actual_pair\n"); + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); + } + + assert(expected_pair.first == key && expected_pair.second == value); + + index++; + } + + index = 0; + for (auto rule : parsed_grammar.rules) + { + // compare rule to expected rule + for (uint32_t i = 0; i < rule.size(); i++) + { + llama_grammar_element element = rule[i]; + llama_grammar_element expected_element = expected_rules[index]; + + // pretty print error message before asserting + if (expected_element.type != element.type || expected_element.value != element.value) + { + fprintf(stderr, "index: %u\n", index); + fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value); + fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value); + fprintf(stderr, "expected_element != actual_element\n"); + fprintf(stderr, "all elements:\n"); + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); + } + + assert(expected_element.type == element.type && expected_element.value == element.value); + index++; + } + } +} + +static void verify_failure(const char * grammar_bytes) { + fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes); + llama_grammar_parser result; + result.parse(grammar_bytes); + assert(result.rules.empty() && "should have failed"); +} + +int main() +{ + verify_failure(R"""( + root ::= "a"{,}" + )"""); + + verify_failure(R"""( + root ::= "a"{,10}" + )"""); + + verify_parsing(R"""( + root ::= "a" + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a" | [bdx-z] | [^1-3] + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 'b'}, + {LLAMA_GRETYPE_CHAR_ALT, 'd'}, + {LLAMA_GRETYPE_CHAR_ALT, 'x'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR_NOT, '1'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a+ + a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"+ + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 
*/ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a? + a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"? + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a* + a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"* + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2} + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2,} + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{ 4} + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2,4} + )""", { + {"root", 0}, + {"root_1", 1}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= (expr "=" term "\n")+ + expr ::= term ([-+*/] term)* + term ::= [0-9]+ + )""", { + {"expr", 2}, + {"expr_5", 5}, + {"expr_6", 6}, + {"root", 0}, + {"root_1", 1}, + {"root_4", 4}, + {"term", 3}, + {"term_7", 7}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, '='}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_CHAR, '\n'}, + {LLAMA_GRETYPE_END, 0}, + // expr (index 2) + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + 
{LLAMA_GRETYPE_END, 0}, + // term (index 3) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, + {LLAMA_GRETYPE_END, 0}, + // root_4 (index 4) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // expr_5 (index 5) + {LLAMA_GRETYPE_CHAR, '-'}, + {LLAMA_GRETYPE_CHAR_ALT, '+'}, + {LLAMA_GRETYPE_CHAR_ALT, '*'}, + {LLAMA_GRETYPE_CHAR_ALT, '/'}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_END, 0}, + // expr_6 (index 6) + {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5}, + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // term_7 (index 7) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= (expr "=" ws term "\n")+ + expr ::= term ([-+*/] term)* + term ::= ident | num | "(" ws expr ")" ws + ident ::= [a-z] [a-z0-9_]* ws + num ::= [0-9]+ ws + ws ::= [ \t\n]* + )""", { + {"expr", 2}, + {"expr_6", 6}, + {"expr_7", 7}, + {"ident", 8}, + {"ident_10", 10}, + {"num", 9}, + {"num_11", 11}, + {"root", 0}, + {"root_1", 1}, + {"root_5", 5}, + {"term", 4}, + {"ws", 3}, + {"ws_12", 12}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, '='}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, + {LLAMA_GRETYPE_CHAR, '\n'}, + {LLAMA_GRETYPE_END, 0}, + // expr (index 2) + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, + {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, + {LLAMA_GRETYPE_END, 0}, + // ws (index 3) + {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12}, + {LLAMA_GRETYPE_END, 0}, + // term (index 4) + {LLAMA_GRETYPE_RULE_REF, /* ident */ 8}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_RULE_REF, /* num */ 9}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, '('}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, ')'}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_END, 0}, + // root_5 (index 5) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // expr_6 (index 6) + {LLAMA_GRETYPE_CHAR, '-'}, + {LLAMA_GRETYPE_CHAR_ALT, '+'}, + {LLAMA_GRETYPE_CHAR_ALT, '*'}, + {LLAMA_GRETYPE_CHAR_ALT, '/'}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, + {LLAMA_GRETYPE_END, 0}, + // expr_7 (index 7) + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // ident (index 8) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_END, 0}, + // num (index 9) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_END, 0}, + // ident_10 (index 10) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_CHAR_ALT, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_CHAR_ALT, '_'}, + {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // num_11 (index 
11)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // ws_12 (index 12)
+        {LLAMA_GRETYPE_CHAR, ' '},
+        {LLAMA_GRETYPE_CHAR_ALT, '\t'},
+        {LLAMA_GRETYPE_CHAR_ALT, '\n'},
+        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    // <[1000]> = ""
+    // <[1001]> = ""
+    verify_parsing(R"""(
+        root ::= <[1000]> !<[1001]> <[1001]>
+        )""", {
+        {"root", 0}
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_TOKEN, 1000},
+        {LLAMA_GRETYPE_TOKEN_NOT, 1001},
+        {LLAMA_GRETYPE_TOKEN, 1001},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-jinja.cpp b/llama.cpp/tests/test-jinja.cpp
new file mode 100644
index 0000000..1f25c6a
--- /dev/null
+++ b/llama.cpp/tests/test-jinja.cpp
@@ -0,0 +1,2290 @@
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "jinja/runtime.h"
+#include "jinja/parser.h"
+#include "jinja/lexer.h"
+#include "jinja/utils.h"
+
+#include "testing.h"
+
+using json = nlohmann::ordered_json;
+
+static void test_template(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect);
+
+static void test_whitespace_control(testing & t);
+static void test_conditionals(testing & t);
+static void test_loops(testing & t);
+static void test_expressions(testing & t);
+static void test_set_statement(testing & t);
+static void test_filters(testing & t);
+static void test_literals(testing & t);
+static void test_comments(testing & t);
+static void test_macros(testing & t);
+static void test_namespace(testing & t);
+static void test_tests(testing & t);
+static void test_string_methods(testing & t);
+static void test_array_methods(testing & t);
+static void test_object_methods(testing & t);
+static void test_hasher(testing & t);
+static void test_fuzzing(testing & t);
+
+static bool g_python_mode = false;
+
+int main(int argc, char *argv[]) {
+    testing t(std::cout);
+    t.verbose = true;
+
+    // usage: test-jinja [-py] [filter_regex]
+    //   -py : enable python mode (use python jinja2 for rendering expected output)
+    //         only use this for cross-checking, not for correctness
+    //   note: the implementation of this flag is basic, only intended to be used by maintainers
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "-py") {
+            g_python_mode = true;
+        } else {
+            t.set_filter(arg);
+        }
+    }
+
+    t.test("whitespace control", test_whitespace_control);
+    t.test("conditionals", test_conditionals);
+    t.test("loops", test_loops);
+    t.test("expressions", test_expressions);
+    t.test("set statement", test_set_statement);
+    t.test("filters", test_filters);
+    t.test("literals", test_literals);
+    t.test("comments", test_comments);
+    t.test("macros", test_macros);
+    t.test("namespace", test_namespace);
+    t.test("tests", test_tests);
+    t.test("string methods", test_string_methods);
+    t.test("array methods", test_array_methods);
+    t.test("object methods", test_object_methods);
+    if (!g_python_mode) {
+        t.test("hasher", test_hasher);
+        t.test("fuzzing", test_fuzzing);
+    }
+
+    return t.summary();
+}
+
+static void test_whitespace_control(testing & t) {
+    test_template(t, "trim_blocks removes newline after tag",
+        "{% if true %}\n"
+        "hello\n"
+        "{% endif %}\n",
+        json::object(),
+        "hello\n"
+    );
+
+    test_template(t, "lstrip_blocks removes leading whitespace",
+        " {% if true %}\n"
+        " hello\n"
+        " {% endif %}\n",
+        json::object(),
+        " hello\n"
+    );
+
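+    // Added sketch, not part of the original suite: assuming the engine follows
+    // Jinja2's whitespace-control rules for comment tags as well, "{#- ... -#}"
+    // should strip the surrounding whitespace exactly like "{%- ... -%}" does
+    // for statement tags. The expected output below assumes those semantics.
+    test_template(t, "comment whitespace control (assumed Jinja2 semantics)",
+        " {#- note -#} \nhello",
+        json::object(),
+        "hello"
+    );
+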
+ test_template(t, "for loop with trim_blocks", + "{% for i in items %}\n" + "{{ i }}\n" + "{% endfor %}\n", + {{"items", json::array({1, 2, 3})}}, + "1\n2\n3\n" + ); + + test_template(t, "explicit strip both", + " {%- if true -%} \n" + "hello\n" + " {%- endif -%} \n", + json::object(), + "hello" + ); + + test_template(t, "expression whitespace control", + " {{- 'hello' -}} \n", + json::object(), + "hello" + ); + + test_template(t, "inline block no newline", + "{% if true %}yes{% endif %}", + json::object(), + "yes" + ); +} + +static void test_conditionals(testing & t) { + test_template(t, "if true", + "{% if cond %}yes{% endif %}", + {{"cond", true}}, + "yes" + ); + + test_template(t, "if false", + "{% if cond %}yes{% endif %}", + {{"cond", false}}, + "" + ); + + test_template(t, "if else", + "{% if cond %}yes{% else %}no{% endif %}", + {{"cond", false}}, + "no" + ); + + test_template(t, "if elif else", + "{% if a %}A{% elif b %}B{% else %}C{% endif %}", + {{"a", false}, {"b", true}}, + "B" + ); + + test_template(t, "nested if", + "{% if outer %}{% if inner %}both{% endif %}{% endif %}", + {{"outer", true}, {"inner", true}}, + "both" + ); + + test_template(t, "comparison operators", + "{% if x > 5 %}big{% endif %}", + {{"x", 10}}, + "big" + ); + + test_template(t, "object comparison", + "{% if {0: 1, none: 2, 1.0: 3, '0': 4, true: 5} == {false: 1, none: 2, 1: 5, '0': 4} %}equal{% endif %}", + json::object(), + "equal" + ); + + test_template(t, "array comparison", + "{% if [0, 1.0, false] == [false, 1, 0.0] %}equal{% endif %}", + json::object(), + "equal" + ); + + test_template(t, "logical and", + "{% if a and b %}both{% endif %}", + {{"a", true}, {"b", true}}, + "both" + ); + + test_template(t, "logical or", + "{% if a or b %}either{% endif %}", + {{"a", false}, {"b", true}}, + "either" + ); + + test_template(t, "logical not", + "{% if not a %}negated{% endif %}", + {{"a", false}}, + "negated" + ); + + test_template(t, "in operator (element in array)", + "{% if 'x' in items %}found{% endif %}", + {{"items", json::array({"x", "y"})}}, + "found" + ); + + test_template(t, "in operator (substring)", + "{% if 'bc' in 'abcd' %}found{% endif %}", + json::object(), + "found" + ); + + test_template(t, "in operator (object key)", + "{% if 'key' in obj %}found{% endif %}", + {{"obj", {{"key", 1}, {"other", 2}}}}, + "found" + ); + + test_template(t, "is defined", + "{% if x is defined %}yes{% else %}no{% endif %}", + {{"x", 1}}, + "yes" + ); + + test_template(t, "is not defined", + "{% if y is not defined %}yes{% else %}no{% endif %}", + json::object(), + "yes" + ); + + test_template(t, "is undefined falsy", + "{{ 'yes' if not y else 'no' }}", + json::object(), + "yes" + ); + + test_template(t, "is undefined attribute falsy", + "{{ 'yes' if not y.x else 'no' }}", + {{"y", true}}, + "yes" + ); + + test_template(t, "is undefined key falsy", + "{{ 'yes' if not y['x'] else 'no' }}", + {{"y", {{}}}}, + "yes" + ); + + test_template(t, "is empty array falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", json::array()}}, + "yes" + ); + + test_template(t, "is empty object falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", json::object()}}, + "yes" + ); + + test_template(t, "is empty string falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", ""}}, + "yes" + ); + + test_template(t, "is 0 falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", 0}}, + "yes" + ); + + test_template(t, "is 0.0 falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", 0.0}}, + "yes" + ); + + test_template(t, "is non-empty array truthy", + 
"{{ 'yes' if y else 'no' }}", + {{"y", json::array({""})}}, + "yes" + ); + + test_template(t, "is non-empty object truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", {"x", false}}}, + "yes" + ); + + test_template(t, "is non-empty string truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", "0"}}, + "yes" + ); + + test_template(t, "is 1 truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", 1}}, + "yes" + ); + + test_template(t, "is 1.0 truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", 1.0}}, + "yes" + ); +} + +static void test_loops(testing & t) { + test_template(t, "simple for", + "{% for i in items %}{{ i }}{% endfor %}", + {{"items", json::array({1, 2, 3})}}, + "123" + ); + + test_template(t, "loop.index", + "{% for i in items %}{{ loop.index }}{% endfor %}", + {{"items", json::array({"a", "b", "c"})}}, + "123" + ); + + test_template(t, "loop.index0", + "{% for i in items %}{{ loop.index0 }}{% endfor %}", + {{"items", json::array({"a", "b", "c"})}}, + "012" + ); + + test_template(t, "loop.first and loop.last", + "{% for i in items %}{% if loop.first %}[{% endif %}{{ i }}{% if loop.last %}]{% endif %}{% endfor %}", + {{"items", json::array({1, 2, 3})}}, + "[123]" + ); + + test_template(t, "loop.length", + "{% for i in items %}{{ loop.length }}{% endfor %}", + {{"items", json::array({"a", "b"})}}, + "22" + ); + + test_template(t, "for over dict items", + "{% for k, v in data.items() %}{{ k }}={{ v }} {% endfor %}", + {{"data", {{"x", 1}, {"y", 2}}}}, + "x=1 y=2 " + ); + + test_template(t, "for else empty", + "{% for i in items %}{{ i }}{% else %}empty{% endfor %}", + {{"items", json::array()}}, + "empty" + ); + + test_template(t, "for undefined empty", + "{% for i in items %}{{ i }}{% else %}empty{% endfor %}", + json::object(), + "empty" + ); + + test_template(t, "nested for", + "{% for i in a %}{% for j in b %}{{ i }}{{ j }}{% endfor %}{% endfor %}", + {{"a", json::array({1, 2})}, {"b", json::array({"x", "y"})}}, + "1x1y2x2y" + ); + + test_template(t, "for with range", + "{% for i in range(3) %}{{ i }}{% endfor %}", + json::object(), + "012" + ); +} + +static void test_expressions(testing & t) { + test_template(t, "simple variable", + "{{ x }}", + {{"x", 42}}, + "42" + ); + + test_template(t, "dot notation", + "{{ user.name }}", + {{"user", {{"name", "Bob"}}}}, + "Bob" + ); + + test_template(t, "negative float (not dot notation)", + "{{ -1.0 }}", + json::object(), + "-1.0" + ); + + test_template(t, "bracket notation", + "{{ user['name'] }}", + {{"user", {{"name", "Bob"}}}}, + "Bob" + ); + + test_template(t, "array access", + "{{ items[1] }}", + {{"items", json::array({"a", "b", "c"})}}, + "b" + ); + + test_template(t, "array negative access", + "{{ items[-1] }}", + {{"items", json::array({"a", "b", "c"})}}, + "c" + ); + + test_template(t, "array slice", + "{{ items[1:-1]|string }}", + {{"items", json::array({"a", "b", "c"})}}, + "['b']" + ); + + test_template(t, "array slice step", + "{{ items[::2]|string }}", + {{"items", json::array({"a", "b", "c"})}}, + "['a', 'c']" + ); + + test_template(t, "tuple slice", + "{{ ('a', 'b', 'c')[::-1]|string }}", + json::object(), + "('c', 'b', 'a')" + ); + + test_template(t, "arithmetic", + "{{ (a + b) * c }}", + {{"a", 2}, {"b", 3}, {"c", 4}}, + "20" + ); + + test_template(t, "string concat ~", + "{{ 'hello' ~ ' ' ~ 'world' }}", + json::object(), + "hello world" + ); + + test_template(t, "ternary", + "{{ 'yes' if cond else 'no' }}", + {{"cond", true}}, + "yes" + ); +} + +static void test_set_statement(testing & t) { + test_template(t, "simple set", + "{% set 
x = 5 %}{{ x }}", + json::object(), + "5" + ); + + test_template(t, "set with expression", + "{% set x = a + b %}{{ x }}", + {{"a", 10}, {"b", 20}}, + "30" + ); + + test_template(t, "set list", + "{% set items = [1, 2, 3] %}{{ items|length }}", + json::object(), + "3" + ); + + test_template(t, "set dict", + "{% set d = {'a': 1} %}{{ d.a }}", + json::object(), + "1" + ); + + test_template(t, "set dict with mixed type keys", + "{% set d = {0: 1, none: 2, 1.0: 3, '0': 4, (0, 0): 5, false: 6, 1: 7} %}{{ d[(0, 0)] + d[0] + d[none] + d['0'] + d[false] + d[1.0] + d[1] }}", + json::object(), + "37" + ); + + test_template(t, "print dict with mixed type keys", + "{% set d = {0: 1, none: 2, 1.0: 3, '0': 4, (0, 0): 5, true: 6} %}{{ d|string }}", + json::object(), + "{0: 1, None: 2, 1.0: 6, '0': 4, (0, 0): 5}" + ); + + test_template(t, "print array with mixed types", + "{% set d = [0, none, 1.0, '0', true, (0, 0)] %}{{ d|string }}", + json::object(), + "[0, None, 1.0, '0', True, (0, 0)]" + ); + + test_template(t, "object member assignment with mixed key types", + "{% set d = namespace() %}{% set d.a = 123 %}{{ d['a'] == 123 }}", + json::object(), + "True" + ); + + test_template(t, "tuple unpacking", + "{% set t = (1, 2, 3) %}{% set a, b, c = t %}{{ a + b + c }}", + json::object(), + "6" + ); +} + +static void test_filters(testing & t) { + test_template(t, "upper", + "{{ 'hello'|upper }}", + json::object(), + "HELLO" + ); + + test_template(t, "lower", + "{{ 'HELLO'|lower }}", + json::object(), + "hello" + ); + + test_template(t, "capitalize", + "{{ 'heLlo World'|capitalize }}", + json::object(), + "Hello world" + ); + + test_template(t, "title", + "{{ 'hello world'|title }}", + json::object(), + "Hello World" + ); + + test_template(t, "trim", + "{{ ' \r\n\thello\t\n\r '|trim }}", + json::object(), + "hello" + ); + + test_template(t, "trim chars", + "{{ 'xyxhelloxyx'|trim('xy') }}", + json::object(), + "hello" + ); + + test_template(t, "length string", + "{{ 'hello'|length }}", + json::object(), + "5" + ); + + test_template(t, "replace", + "{{ 'hello world'|replace('world', 'jinja') }}", + json::object(), + "hello jinja" + ); + + test_template(t, "length list", + "{{ items|length }}", + {{"items", json::array({1, 2, 3})}}, + "3" + ); + + test_template(t, "first", + "{{ items|first }}", + {{"items", json::array({10, 20, 30})}}, + "10" + ); + + test_template(t, "last", + "{{ items|last }}", + {{"items", json::array({10, 20, 30})}}, + "30" + ); + + test_template(t, "reverse", + "{% for i in items|reverse %}{{ i }}{% endfor %}", + {{"items", json::array({1, 2, 3})}}, + "321" + ); + + test_template(t, "sort", + "{% for i in items|sort %}{{ i }}{% endfor %}", + {{"items", json::array({3, 1, 2})}}, + "123" + ); + + test_template(t, "sort reverse", + "{% for i in items|sort(true) %}{{ i }}{% endfor %}", + {{"items", json::array({3, 1, 2})}}, + "321" + ); + + test_template(t, "sort with attribute", + "{{ items|sort(attribute='name')|join(attribute='age') }}", + {{"items", json::array({ + json({{"name", "c"}, {"age", 3}}), + json({{"name", "a"}, {"age", 1}}), + json({{"name", "b"}, {"age", 2}}), + })}}, + "123" + ); + + test_template(t, "sort with numeric attribute", + "{{ items|sort(attribute=0)|join(attribute=1) }}", + {{"items", json::array({ + json::array({3, "z"}), + json::array({1, "x"}), + json::array({2, "y"}), + })}}, + "xyz" + ); + + test_template(t, "join", + "{{ items|join(', ') }}", + {{"items", json::array({"a", "b", "c"})}}, + "a, b, c" + ); + + test_template(t, "join default separator", + "{{ 
items|join }}", + {{"items", json::array({"x", "y", "z"})}}, + "xyz" + ); + + test_template(t, "abs", + "{{ -5|abs }}", + json::object(), + "5" + ); + + test_template(t, "int from string", + "{{ '42'|int }}", + json::object(), + "42" + ); + + test_template(t, "int from string with default", + "{{ ''|int(1) }}", + json::object(), + "1" + ); + + test_template(t, "int from string with base", + "{{ '11'|int(base=2) }}", + json::object(), + "3" + ); + + test_template(t, "float from string", + "{{ '3.14'|float }}", + json::object(), + "3.14" + ); + + test_template(t, "default with value", + "{{ x|default('fallback') }}", + {{"x", "actual"}}, + "actual" + ); + + test_template(t, "default without value", + "{{ y|default('fallback') }}", + json::object(), + "fallback" + ); + + test_template(t, "default with falsy value", + "{{ ''|default('fallback', true) }}", + json::object(), + "fallback" + ); + + test_template(t, "tojson ensure_ascii=true", + "{{ data|tojson(ensure_ascii=true) }}", + {{"data", "\u2713"}}, + "\"\\u2713\"" + ); + + test_template(t, "tojson sort_keys=true", + "{{ data|tojson(sort_keys=true) }}", + {{"data", {{"b", 2}, {"a", 1}}}}, + "{\"a\": 1, \"b\": 2}" + ); + + test_template(t, "tojson", + "{{ data|tojson }}", + {{"data", {{"a", 1}, {"b", json::array({1, 2})}}}}, + "{\"a\": 1, \"b\": [1, 2]}" + ); + + test_template(t, "tojson indent=4", + "{{ data|tojson(indent=4) }}", + {{"data", {{"a", 1}, {"b", json::array({1, 2})}}}}, + "{\n \"a\": 1,\n \"b\": [\n 1,\n 2\n ]\n}" + ); + + test_template(t, "tojson separators=(',',':')", + "{{ data|tojson(separators=(',',':')) }}", + {{"data", {{"a", 1}, {"b", json::array({1, 2})}}}}, + "{\"a\":1,\"b\":[1,2]}" + ); + + test_template(t, "tojson separators=(',',': ') indent=2", + "{{ data|tojson(separators=(',',': '), indent=2) }}", + {{"data", {{"a", 1}, {"b", json::array({1, 2})}}}}, + "{\n \"a\": 1,\n \"b\": [\n 1,\n 2\n ]\n}" + ); + + test_template(t, "chained filters", + "{{ ' HELLO '|trim|lower }}", + json::object(), + "hello" + ); + + test_template(t, "none to string", + "{{ x|string }}", + {{"x", nullptr}}, + "None" + ); +} + +static void test_literals(testing & t) { + test_template(t, "integer", + "{{ 42 }}", + json::object(), + "42" + ); + + test_template(t, "float", + "{{ 3.14 }}", + json::object(), + "3.14" + ); + + test_template(t, "string", + "{{ 'hello' }}", + json::object(), + "hello" + ); + + test_template(t, "boolean true", + "{{ true }}", + json::object(), + "True" + ); + + test_template(t, "boolean false", + "{{ false }}", + json::object(), + "False" + ); + + test_template(t, "none", + "{% if x is none %}null{% endif %}", + {{"x", nullptr}}, + "null" + ); + + test_template(t, "list literal", + "{% for i in [1, 2, 3] %}{{ i }}{% endfor %}", + json::object(), + "123" + ); + + test_template(t, "dict literal", + "{% set d = {'a': 1} %}{{ d.a }}", + json::object(), + "1" + ); + + test_template(t, "integer|abs", + "{{ -42 | abs }}", + json::object(), + "42" + ); + + test_template(t, "integer|float", + "{{ 42 | float }}", + json::object(), + "42.0" + ); + + test_template(t, "integer|tojson", + "{{ 42 | tojson }}", + json::object(), + "42" + ); + + test_template(t, "float|abs", + "{{ -3.14 | abs }}", + json::object(), + "3.14" + ); + + test_template(t, "float|int", + "{{ 3.14 | int }}", + json::object(), + "3" + ); + + test_template(t, "float|tojson", + "{{ 3.14 | tojson }}", + json::object(), + "3.14" + ); + + test_template(t, "string|tojson", + "{{ 'hello' | tojson }}", + json::object(), + "\"hello\"" + ); + + test_template(t, 
"boolean|int", + "{{ true | int }}", + json::object(), + "1" + ); + + test_template(t, "boolean|float", + "{{ true | float }}", + json::object(), + "1.0" + ); + + test_template(t, "boolean|tojson", + "{{ true | tojson }}", + json::object(), + "true" + ); +} + +static void test_comments(testing & t) { + test_template(t, "inline comment", + "before{# comment #}after", + json::object(), + "beforeafter" + ); + + test_template(t, "comment ignores code", + "{% set x = 1 %}{# {% set x = 999 %} #}{{ x }}", + json::object(), + "1" + ); +} + +static void test_macros(testing & t) { + test_template(t, "simple macro", + "{% macro greet(name) %}Hello {{ name }}{% endmacro %}{{ greet('World') }}", + json::object(), + "Hello World" + ); + + test_template(t, "macro default arg", + "{% macro greet(name='Guest') %}Hi {{ name }}{% endmacro %}{{ greet() }}", + json::object(), + "Hi Guest" + ); +} + +static void test_namespace(testing & t) { + test_template(t, "namespace counter", + "{% set ns = namespace(count=0) %}{% for i in range(3) %}{% set ns.count = ns.count + 1 %}{% endfor %}{{ ns.count }}", + json::object(), + "3" + ); +} + +static void test_tests(testing & t) { + test_template(t, "is odd", + "{% if 3 is odd %}yes{% endif %}", + json::object(), + "yes" + ); + + test_template(t, "is even", + "{% if 4 is even %}yes{% endif %}", + json::object(), + "yes" + ); + + test_template(t, "is false", + "{{ 'yes' if x is false }}", + {{"x", false}}, + "yes" + ); + + test_template(t, "is true", + "{{ 'yes' if x is true }}", + {{"x", true}}, + "yes" + ); + + test_template(t, "string is false", + "{{ 'yes' if x is false else 'no' }}", + {{"x", ""}}, + "no" + ); + + test_template(t, "is divisibleby", + "{{ 'yes' if x is divisibleby(2) }}", + {{"x", 2}}, + "yes" + ); + + test_template(t, "is eq", + "{{ 'yes' if 3 is eq(3) }}", + json::object(), + "yes" + ); + + test_template(t, "is not equalto", + "{{ 'yes' if 3 is not equalto(4) }}", + json::object(), + "yes" + ); + + test_template(t, "is ge", + "{{ 'yes' if 3 is ge(3) }}", + json::object(), + "yes" + ); + + test_template(t, "is gt", + "{{ 'yes' if 3 is gt(2) }}", + json::object(), + "yes" + ); + + test_template(t, "is greaterthan", + "{{ 'yes' if 3 is greaterthan(2) }}", + json::object(), + "yes" + ); + + test_template(t, "is lt", + "{{ 'yes' if 2 is lt(3) }}", + json::object(), + "yes" + ); + + test_template(t, "is lessthan", + "{{ 'yes' if 2 is lessthan(3) }}", + json::object(), + "yes" + ); + + test_template(t, "is ne", + "{{ 'yes' if 2 is ne(3) }}", + json::object(), + "yes" + ); + + test_template(t, "is lower", + "{{ 'yes' if 'lowercase' is lower }}", + json::object(), + "yes" + ); + + test_template(t, "is upper", + "{{ 'yes' if 'UPPERCASE' is upper }}", + json::object(), + "yes" + ); + + test_template(t, "is sameas", + "{{ 'yes' if x is sameas(false) }}", + {{"x", false}}, + "yes" + ); + + test_template(t, "is boolean", + "{{ 'yes' if x is boolean }}", + {{"x", true}}, + "yes" + ); + + test_template(t, "is callable", + "{{ 'yes' if ''.strip is callable }}", + json::object(), + "yes" + ); + + test_template(t, "is escaped", + "{{ 'yes' if 'foo'|safe is escaped }}", + json::object(), + "yes" + ); + + test_template(t, "is filter", + "{{ 'yes' if 'trim' is filter }}", + json::object(), + "yes" + ); + + test_template(t, "is float", + "{{ 'yes' if x is float }}", + {{"x", 1.1}}, + "yes" + ); + + test_template(t, "is integer", + "{{ 'yes' if x is integer }}", + {{"x", 1}}, + "yes" + ); + + test_template(t, "is sequence", + "{{ 'yes' if x is sequence }}", + {{"x", 
json::array({1, 2, 3})}}, + "yes" + ); + + test_template(t, "is test", + "{{ 'yes' if 'sequence' is test }}", + json::object(), + "yes" + ); + + test_template(t, "is undefined", + "{{ 'yes' if x is undefined }}", + json::object(), + "yes" + ); + + test_template(t, "is none", + "{% if x is none %}yes{% endif %}", + {{"x", nullptr}}, + "yes" + ); + + test_template(t, "is string", + "{% if x is string %}yes{% endif %}", + {{"x", "hello"}}, + "yes" + ); + + test_template(t, "is number", + "{% if x is number %}yes{% endif %}", + {{"x", 42}}, + "yes" + ); + + test_template(t, "is iterable", + "{% if x is iterable %}yes{% endif %}", + {{"x", json::array({1, 2, 3})}}, + "yes" + ); + + test_template(t, "is mapping", + "{% if x is mapping %}yes{% endif %}", + {{"x", {{"a", 1}}}}, + "yes" + ); + + test_template(t, "undefined is sequence", + "{{ 'yes' if x is sequence }}", + json::object(), + "yes" + ); + + test_template(t, "undefined is iterable", + "{{ 'yes' if x is iterable }}", + json::object(), + "yes" + ); + + test_template(t, "is in (array, true)", + "{{ 'yes' if 2 is in([1, 2, 3]) }}", + json::object(), + "yes" + ); + + test_template(t, "is in (array, false)", + "{{ 'yes' if 5 is in([1, 2, 3]) else 'no' }}", + json::object(), + "no" + ); + + test_template(t, "is in (string)", + "{{ 'yes' if 'bc' is in('abcde') }}", + json::object(), + "yes" + ); + + test_template(t, "is in (object keys)", + "{{ 'yes' if 'a' is in(obj) }}", + {{"obj", {{"a", 1}, {"b", 2}}}}, + "yes" + ); + + test_template(t, "reject with in test", + "{{ items | reject('in', skip) | join(', ') }}", + {{"items", json::array({"a", "b", "c", "d"})}, {"skip", json::array({"b", "d"})}}, + "a, c" + ); + + test_template(t, "select with in test", + "{{ items | select('in', keep) | join(', ') }}", + {{"items", json::array({"a", "b", "c", "d"})}, {"keep", json::array({"b", "c"})}}, + "b, c" + ); +} + +static void test_string_methods(testing & t) { + test_template(t, "string.upper()", + "{{ s.upper() }}", + {{"s", "hello"}}, + "HELLO" + ); + + test_template(t, "string.lower()", + "{{ s.lower() }}", + {{"s", "HELLO"}}, + "hello" + ); + + test_template(t, "string.strip()", + "[{{ s.strip() }}]", + {{"s", " hello "}}, + "[hello]" + ); + + test_template(t, "string.lstrip()", + "[{{ s.lstrip() }}]", + {{"s", " hello"}}, + "[hello]" + ); + + test_template(t, "string.rstrip()", + "[{{ s.rstrip() }}]", + {{"s", "hello "}}, + "[hello]" + ); + + test_template(t, "string.title()", + "{{ s.title() }}", + {{"s", "hello world"}}, + "Hello World" + ); + + test_template(t, "string.capitalize()", + "{{ s.capitalize() }}", + {{"s", "heLlo World"}}, + "Hello world" + ); + + test_template(t, "string.startswith() true", + "{% if s.startswith('hel') %}yes{% endif %}", + {{"s", "hello"}}, + "yes" + ); + + test_template(t, "string.startswith() false", + "{% if s.startswith('xyz') %}yes{% else %}no{% endif %}", + {{"s", "hello"}}, + "no" + ); + + test_template(t, "string.endswith() true", + "{% if s.endswith('lo') %}yes{% endif %}", + {{"s", "hello"}}, + "yes" + ); + + test_template(t, "string.endswith() false", + "{% if s.endswith('xyz') %}yes{% else %}no{% endif %}", + {{"s", "hello"}}, + "no" + ); + + test_template(t, "string.split() with sep", + "{{ s.split(',')|join('-') }}", + {{"s", "a,b,c"}}, + "a-b-c" + ); + + test_template(t, "string.split() with maxsplit", + "{{ s.split(',', 1)|join('-') }}", + {{"s", "a,b,c"}}, + "a-b,c" + ); + + test_template(t, "string.rsplit() with sep", + "{{ s.rsplit(',')|join('-') }}", + {{"s", "a,b,c"}}, + "a-b-c" + ); + + 
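+    // Note: with an unlimited number of splits, rsplit behaves exactly like
+    // split (as the previous case shows); the two only diverge once maxsplit
+    // caps the number of cuts, because rsplit then counts separators from the
+    // right. That is what the next case checks.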
test_template(t, "string.rsplit() with maxsplit", + "{{ s.rsplit(',', 1)|join('-') }}", + {{"s", "a,b,c"}}, + "a,b-c" + ); + + test_template(t, "string.replace() basic", + "{{ s.replace('world', 'jinja') }}", + {{"s", "hello world"}}, + "hello jinja" + ); + + test_template(t, "string.replace() with count", + "{{ s.replace('a', 'X', 2) }}", + {{"s", "banana"}}, + "bXnXna" + ); + + test_template(t, "undefined|capitalize", + "{{ arr|capitalize }}", + json::object(), + "" + ); + + test_template(t, "undefined|title", + "{{ arr|title }}", + json::object(), + "" + ); + + test_template(t, "undefined|truncate", + "{{ arr|truncate(9) }}", + json::object(), + "" + ); + + test_template(t, "undefined|upper", + "{{ arr|upper }}", + json::object(), + "" + ); + + test_template(t, "undefined|lower", + "{{ arr|lower }}", + json::object(), + "" + ); + + test_template(t, "undefined|replace", + "{{ arr|replace('a', 'b') }}", + json::object(), + "" + ); + + test_template(t, "undefined|trim", + "{{ arr|trim }}", + json::object(), + "" + ); + + test_template(t, "undefined|wordcount", + "{{ arr|wordcount }}", + json::object(), + "0" + ); +} + +static void test_array_methods(testing & t) { + test_template(t, "array|selectattr by attribute", + "{% for item in items|selectattr('active') %}{{ item.name }} {% endfor %}", + {{"items", json::array({ + {{"name", "a"}, {"active", true}}, + {{"name", "b"}, {"active", false}}, + {{"name", "c"}, {"active", true}} + })}}, + "a c " + ); + + test_template(t, "array|selectattr with operator", + "{% for item in items|selectattr('value', 'equalto', 5) %}{{ item.name }} {% endfor %}", + {{"items", json::array({ + {{"name", "a"}, {"value", 3}}, + {{"name", "b"}, {"value", 5}}, + {{"name", "c"}, {"value", 5}} + })}}, + "b c " + ); + + test_template(t, "array|tojson", + "{{ arr|tojson }}", + {{"arr", json::array({1, 2, 3})}}, + "[1, 2, 3]" + ); + + test_template(t, "array|tojson with strings", + "{{ arr|tojson }}", + {{"arr", json::array({"a", "b", "c"})}}, + "[\"a\", \"b\", \"c\"]" + ); + + test_template(t, "array|tojson nested", + "{{ arr|tojson }}", + {{"arr", json::array({json::array({1, 2}), json::array({3, 4})})}}, + "[[1, 2], [3, 4]]" + ); + + test_template(t, "array|last", + "{{ arr|last }}", + {{"arr", json::array({10, 20, 30})}}, + "30" + ); + + test_template(t, "array|last single element", + "{{ arr|last }}", + {{"arr", json::array({42})}}, + "42" + ); + + test_template(t, "array|join with separator", + "{{ arr|join(', ') }}", + {{"arr", json::array({"a", "b", "c"})}}, + "a, b, c" + ); + + test_template(t, "array|join with custom separator", + "{{ arr|join(' | ') }}", + {{"arr", json::array({1, 2, 3})}}, + "1 | 2 | 3" + ); + + test_template(t, "array|join default separator", + "{{ arr|join }}", + {{"arr", json::array({"x", "y", "z"})}}, + "xyz" + ); + + test_template(t, "array|join attribute", + "{{ arr|join(attribute='age') }}", + {{"arr", json::array({ + json({{"name", "a"}, {"age", 1}}), + json({{"name", "b"}, {"age", 2}}), + json({{"name", "c"}, {"age", 3}}), + })}}, + "123" + ); + + test_template(t, "array|join numeric attribute", + "{{ arr|join(attribute=-1) }}", + {{"arr", json::array({json::array({1}), json::array({2}), json::array({3})})}}, + "123" + ); + + test_template(t, "array.pop() last", + "{{ arr.pop() }}-{{ arr|join(',') }}", + {{"arr", json::array({"a", "b", "c"})}}, + "c-a,b" + ); + + test_template(t, "array.pop() with index", + "{{ arr.pop(0) }}-{{ arr|join(',') }}", + {{"arr", json::array({"a", "b", "c"})}}, + "a-b,c" + ); + + test_template(t, 
"array.append()", + "{% set _ = arr.append('d') %}{{ arr|join(',') }}", + {{"arr", json::array({"a", "b", "c"})}}, + "a,b,c,d" + ); + + test_template(t, "array|map with attribute", + "{% for v in arr|map(attribute='age') %}{{ v }} {% endfor %}", + {{"arr", json::array({ + json({{"name", "a"}, {"age", 1}}), + json({{"name", "b"}, {"age", 2}}), + json({{"name", "c"}, {"age", 3}}), + })}}, + "1 2 3 " + ); + + test_template(t, "array|map with attribute default", + "{% for v in arr|map(attribute='age', default=3) %}{{ v }} {% endfor %}", + {{"arr", json::array({ + json({{"name", "a"}, {"age", 1}}), + json({{"name", "b"}, {"age", 2}}), + json({{"name", "c"}}), + })}}, + "1 2 3 " + ); + + test_template(t, "array|map without attribute default", + "{% for v in arr|map(attribute='age') %}{{ v }} {% endfor %}", + {{"arr", json::array({ + json({{"name", "a"}, {"age", 1}}), + json({{"name", "b"}, {"age", 2}}), + json({{"name", "c"}}), + })}}, + "1 2 " + ); + + test_template(t, "array|map with numeric attribute", + "{% for v in arr|map(attribute=0) %}{{ v }} {% endfor %}", + {{"arr", json::array({ + json::array({10, "x"}), + json::array({20, "y"}), + json::array({30, "z"}), + })}}, + "10 20 30 " + ); + + test_template(t, "array|map with negative attribute", + "{% for v in arr|map(attribute=-1) %}{{ v }} {% endfor %}", + {{"arr", json::array({ + json::array({10, "x"}), + json::array({20, "y"}), + json::array({30, "z"}), + })}}, + "x y z " + ); + + test_template(t, "array|map with filter", + "{{ arr|map('int')|sum }}", + {{"arr", json::array({"1", "2", "3"})}}, + "6" + ); + + // not used by any chat templates + // test_template(t, "array.insert()", + // "{% set _ = arr.insert(1, 'x') %}{{ arr|join(',') }}", + // {{"arr", json::array({"a", "b", "c"})}}, + // "a,x,b,c" + // ); + + test_template(t, "undefined|select", + "{% for item in items|select('odd') %}{{ item.name }} {% endfor %}", + json::object(), + "" + ); + + test_template(t, "undefined|selectattr", + "{% for item in items|selectattr('active') %}{{ item.name }} {% endfor %}", + json::object(), + "" + ); + + test_template(t, "undefined|reject", + "{% for item in items|reject('even') %}{{ item.name }} {% endfor %}", + json::object(), + "" + ); + + test_template(t, "undefined|rejectattr", + "{% for item in items|rejectattr('active') %}{{ item.name }} {% endfor %}", + json::object(), + "" + ); + + test_template(t, "undefined|list", + "{{ arr|list|string }}", + json::object(), + "[]" + ); + + test_template(t, "undefined|string", + "{{ arr|string }}", + json::object(), + "" + ); + + test_template(t, "undefined|first", + "{{ arr|first }}", + json::object(), + "" + ); + + test_template(t, "undefined|last", + "{{ arr|last }}", + json::object(), + "" + ); + + test_template(t, "undefined|length", + "{{ arr|length }}", + json::object(), + "0" + ); + + test_template(t, "undefined|join", + "{{ arr|join }}", + json::object(), + "" + ); + + test_template(t, "undefined|sort", + "{{ arr|sort|string }}", + json::object(), + "[]" + ); + + test_template(t, "undefined|reverse", + "{{ arr|reverse|join }}", + json::object(), + "" + ); + + test_template(t, "undefined|map", + "{% for v in arr|map(attribute='age') %}{{ v }} {% endfor %}", + json::object(), + "" + ); + + test_template(t, "undefined|min", + "{{ arr|min }}", + json::object(), + "" + ); + + test_template(t, "undefined|max", + "{{ arr|max }}", + json::object(), + "" + ); + + test_template(t, "undefined|unique", + "{{ arr|unique|join }}", + json::object(), + "" + ); + + test_template(t, "undefined|sum", + "{{ 
arr|sum }}", + json::object(), + "0" + ); +} + +static void test_object_methods(testing & t) { + test_template(t, "object.get() existing key", + "{{ obj.get('a') }}", + {{"obj", {{"a", 1}, {"b", 2}}}}, + "1" + ); + + test_template(t, "object.get() missing key", + "[{{ obj.get('c') is none }}]", + {{"obj", {{"a", 1}}}}, + "[True]" + ); + + test_template(t, "object.get() missing key with default", + "{{ obj.get('c', 'default') }}", + {{"obj", {{"a", 1}}}}, + "default" + ); + + test_template(t, "object.items()", + "{% for k, v in obj.items() %}{{ k }}={{ v }} {% endfor %}", + {{"obj", {{"x", 1}, {"y", 2}}}}, + "x=1 y=2 " + ); + + test_template(t, "object.keys()", + "{% for k in obj.keys() %}{{ k }} {% endfor %}", + {{"obj", {{"a", 1}, {"b", 2}}}}, + "a b " + ); + + test_template(t, "object.values()", + "{% for v in obj.values() %}{{ v }} {% endfor %}", + {{"obj", {{"a", 1}, {"b", 2}}}}, + "1 2 " + ); + + test_template(t, "dictsort ascending by key", + "{% for k, v in obj|dictsort %}{{ k }}={{ v }} {% endfor %}", + {{"obj", {{"z", 2}, {"a", 3}, {"m", 1}}}}, + "a=3 m=1 z=2 " + ); + + test_template(t, "dictsort descending by key", + "{% for k, v in obj|dictsort(reverse=true) %}{{ k }}={{ v }} {% endfor %}", + {{"obj", {{"a", 1}, {"b", 2}, {"c", 3}}}}, + "c=3 b=2 a=1 " + ); + + test_template(t, "dictsort by value", + "{% for k, v in obj|dictsort(by='value') %}{{ k }}={{ v }} {% endfor %}", + {{"obj", {{"a", 3}, {"b", 1}, {"c", 2}}}}, + "b=1 c=2 a=3 " + ); + + test_template(t, "dictsort case sensitive", + "{% for k, v in obj|dictsort(case_sensitive=true) %}{{ k }}={{ v }} {% endfor %}", + {{"obj", {{"a", 1}, {"A", 1}, {"b", 2}, {"B", 2}, {"c", 3}}}}, + "A=1 B=2 a=1 b=2 c=3 " + ); + + test_template(t, "object|tojson", + "{{ obj|tojson }}", + {{"obj", {{"name", "test"}, {"value", 42}}}}, + "{\"name\": \"test\", \"value\": 42}" + ); + + test_template(t, "nested object|tojson", + "{{ obj|tojson }}", + {{"obj", {{"outer", {{"inner", "value"}}}}}}, + "{\"outer\": {\"inner\": \"value\"}}" + ); + + test_template(t, "array in object|tojson", + "{{ obj|tojson }}", + {{"obj", {{"items", json::array({1, 2, 3})}}}}, + "{\"items\": [1, 2, 3]}" + ); + + test_template(t, "object attribute and key access", + "{{ obj.keys()|join(',') }} vs {{ obj['keys'] }} vs {{ obj.test }}", + {{"obj", {{"keys", "value"}, {"test", "attr_value"}}}}, + "keys,test vs value vs attr_value" + ); + + test_template(t, "env should not have object methods", + "{{ keys is undefined }} {{ obj.keys is defined }}", + {{"obj", {{"a", "b"}}}}, + "True True" + ); + + test_template(t, "expression as object key", + "{% set d = {'ab': 123} %}{{ d['a' + 'b'] == 123 }}", + json::object(), + "True" + ); + + test_template(t, "numeric as object key (template: Seed-OSS)", + "{% set d = {1: 'a', 2: 'b'} %}{{ d[1] == 'a' and d[2] == 'b' }}", + json::object(), + "True" + ); + + test_template(t, "undefined|items", + "{{ arr|items|join }}", + json::object(), + "" + ); +} + +static void test_hasher(testing & t) { + static const std::vector> chunk_sizes = { + {1, 2}, + {1, 16}, + {8, 1}, + {1, 1024}, + {5, 512}, + {16, 256}, + {45, 122}, + {70, 634}, + }; + + static auto random_bytes = [](size_t length) -> std::string { + std::string data; + data.resize(length); + for (size_t i = 0; i < length; ++i) { + data[i] = static_cast(rand() % 256); + } + return data; + }; + + t.test("state unchanged with empty input", [](testing & t) { + jinja::hasher hasher; + hasher.update("some data"); + size_t initial_state = hasher.digest(); + hasher.update("", 0); + size_t 
final_state = hasher.digest(); + t.assert_true("Hasher state should remain unchanged", initial_state == final_state); + }); + + t.test("different inputs produce different hashes", [](testing & t) { + jinja::hasher hasher1; + hasher1.update("data one"); + size_t hash1 = hasher1.digest(); + + jinja::hasher hasher2; + hasher2.update("data two"); + size_t hash2 = hasher2.digest(); + + t.assert_true("Different inputs should produce different hashes", hash1 != hash2); + }); + + t.test("same inputs produce same hashes", [](testing & t) { + jinja::hasher hasher1; + hasher1.update("consistent data"); + size_t hash1 = hasher1.digest(); + + jinja::hasher hasher2; + hasher2.update("consistent data"); + size_t hash2 = hasher2.digest(); + + t.assert_true("Same inputs should produce same hashes", hash1 == hash2); + }); + + t.test("property: update(a ~ b) == update(a).update(b)", [](testing & t) { + for (const auto & [size1, size2] : chunk_sizes) { + std::string data1 = random_bytes(size1); + std::string data2 = random_bytes(size2); + + jinja::hasher hasher1; + hasher1.update(data1); + hasher1.update(data2); + size_t hash1 = hasher1.digest(); + + jinja::hasher hasher2; + hasher2.update(data1 + data2); + size_t hash2 = hasher2.digest(); + + t.assert_true( + "Hashing in multiple updates should match single update (" + std::to_string(size1) + ", " + std::to_string(size2) + ")", + hash1 == hash2); + } + }); + + t.test("property: update(a ~ b) == update(a).update(b) with more update passes", [](testing & t) { + static const std::vector<size_t> sizes = {3, 732, 131, 13, 17, 256, 436, 99, 4}; + + jinja::hasher hasher1; + jinja::hasher hasher2; + + std::string combined_data; + for (size_t size : sizes) { + std::string data = random_bytes(size); + hasher1.update(data); + combined_data += data; + } + + hasher2.update(combined_data); + size_t hash1 = hasher1.digest(); + size_t hash2 = hasher2.digest(); + t.assert_true( + "Hashing in multiple updates should match single update with many chunks", + hash1 == hash2); + }); + + t.test("property: non-commutativity of update", [](testing & t) { + for (const auto & [size1, size2] : chunk_sizes) { + std::string data1 = random_bytes(size1); + std::string data2 = random_bytes(size2); + + jinja::hasher hasher1; + hasher1.update(data1); + hasher1.update(data2); + size_t hash1 = hasher1.digest(); + + jinja::hasher hasher2; + hasher2.update(data2); + hasher2.update(data1); + size_t hash2 = hasher2.digest(); + + t.assert_true( + "Hashing order should matter (" + std::to_string(size1) + ", " + std::to_string(size2) + ")", + hash1 != hash2); + } + }); + + t.test("property: different lengths produce different hashes (padding block size)", [](testing & t) { + std::string random_data = random_bytes(64); + + jinja::hasher hasher1; + hasher1.update(random_data); + size_t hash1 = hasher1.digest(); + + for (int i = 0; i < 16; ++i) { + random_data.push_back('A'); // change length + jinja::hasher hasher2; + hasher2.update(random_data); + size_t hash2 = hasher2.digest(); + + t.assert_true("Different lengths should produce different hashes (length " + std::to_string(random_data.size()) + ")", hash1 != hash2); + + hash1 = hash2; + } + }); +} + +static void test_template_cpp(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { + t.test(name, [&tmpl, &vars, &expect](testing & t) { + jinja::lexer lexer; + auto lexer_res = lexer.tokenize(tmpl); + + jinja::program ast = jinja::parse_from_tokens(lexer_res); + + jinja::context ctx(tmpl); + 
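// expose the JSON test variables to the template as globals + 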
jinja::global_from_json(ctx, vars, true); + + jinja::runtime runtime(ctx); + + try { + const jinja::value results = runtime.execute(ast); + auto parts = runtime.gather_string_parts(results); + + std::string rendered; + for (const auto & part : parts->as_string().parts) { + rendered += part.val; + } + + if (!t.assert_true("Template render mismatch", expect == rendered)) { + t.log("Template: " + json(tmpl).dump()); + t.log("Expected: " + json(expect).dump()); + t.log("Actual : " + json(rendered).dump()); + } + } catch (const jinja::not_implemented_exception & e) { + // TODO @ngxson : remove this when the test framework supports skipping tests + t.log("Skipped: " + std::string(e.what())); + } + }); +} + +// keep this in-sync with https://github.com/huggingface/transformers/blob/main/src/transformers/utils/chat_template_utils.py +// note: we use SandboxedEnvironment instead of ImmutableSandboxedEnvironment to allow usage of in-place array methods like append() and pop() +static std::string py_script = R"( +import jinja2 +import jinja2.ext as jinja2_ext +import json +import sys +from datetime import datetime +from jinja2.sandbox import SandboxedEnvironment + +tmpl = json.loads(sys.argv[1]) +vars_json = json.loads(sys.argv[2]) + +env = SandboxedEnvironment( + trim_blocks=True, + lstrip_blocks=True, + extensions=[jinja2_ext.loopcontrols], +) + +def raise_exception(message): + raise jinja2.exceptions.TemplateError(message) + +env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys) +env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) +env.globals["raise_exception"] = raise_exception + +template = env.from_string(tmpl) +result = template.render(**vars_json) +print(result, end='') +)"; + +static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { + t.test(name, [&tmpl, &vars, &expect](testing & t) { + // Prepare arguments + std::string tmpl_json = json(tmpl).dump(); + std::string vars_json = vars.dump(); + +#ifdef _WIN32 + const char * python_executable = "python.exe"; +#else + const char * python_executable = "python3"; +#endif + + const char * command_line[] = {python_executable, "-c", py_script.c_str(), tmpl_json.c_str(), vars_json.c_str(), NULL}; + + struct subprocess_s subprocess; + int options = subprocess_option_combined_stdout_stderr + | subprocess_option_no_window + | subprocess_option_inherit_environment + | subprocess_option_search_user_path; + int result = subprocess_create(command_line, options, &subprocess); + + if (result != 0) { + t.log("Failed to create subprocess, error code: " + std::to_string(result)); + t.assert_true("subprocess creation", false); + return; + } + + // Read output + std::string output; + char buffer[1024]; + FILE * p_stdout = subprocess_stdout(&subprocess); + while (fgets(buffer, sizeof(buffer), p_stdout)) { + output += buffer; + } + + int process_return; + subprocess_join(&subprocess, &process_return); + subprocess_destroy(&subprocess); + + if (process_return != 0) { + t.log("Python script failed with exit code: " + std::to_string(process_return)); + t.log("Output: " + output); + t.assert_true("python execution", false); + return; + } + + if (!t.assert_true("Template render mismatch", expect == output)) { + t.log("Template: " + json(tmpl).dump()); + t.log("Expected: " + json(expect).dump()); + t.log("Python : " + 
json(output).dump()); + } + }); +} + +static void test_template(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { + if (g_python_mode) { + test_template_py(t, name, tmpl, vars, expect); + } else { + test_template_cpp(t, name, tmpl, vars, expect); + } +} + +// +// fuzz tests to ensure no crashes occur on malformed inputs +// + +constexpr int JINJA_FUZZ_ITERATIONS = 100; + +// Helper to generate random string +static std::string random_string(std::mt19937 & rng, size_t max_len) { + static const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; + std::uniform_int_distribution<size_t> len_dist(0, max_len); + std::uniform_int_distribution<size_t> char_dist(0, sizeof(charset) - 2); + size_t len = len_dist(rng); + std::string result; + result.reserve(len); + for (size_t i = 0; i < len; ++i) { + result += charset[char_dist(rng)]; + } + return result; +} + +// Helper to execute a fuzz test case - returns true if no crash occurred +static bool fuzz_test_template(const std::string & tmpl, const json & vars) { + try { + // printf("Fuzz testing template: %s\n", tmpl.c_str()); + jinja::lexer lexer; + auto lexer_res = lexer.tokenize(tmpl); + jinja::program ast = jinja::parse_from_tokens(lexer_res); + jinja::context ctx(tmpl); + jinja::global_from_json(ctx, vars, true); + jinja::runtime runtime(ctx); + const jinja::value results = runtime.execute(ast); + runtime.gather_string_parts(results); + return true; // success + } catch (const std::exception &) { + return true; // exception is acceptable, not a crash + } catch (...) { + return true; // any exception is acceptable, not a crash + } +} + +static void test_fuzzing(testing & t) { + const int num_iterations = JINJA_FUZZ_ITERATIONS; + const unsigned int seed = 42; // fixed seed for reproducibility + std::mt19937 rng(seed); + + // Distribution helpers + std::uniform_int_distribution<int> choice_dist(0, 100); + std::uniform_int_distribution<int> int_dist(-1000, 1000); + std::uniform_int_distribution<int> idx_dist(0, 1000); + + // Template fragments for fuzzing + const std::vector<std::string> var_names = { + "x", "y", "z", "arr", "obj", "items", "foo", "bar", "undefined_var", + "none", "true", "false", "None", "True", "False" + }; + const std::vector<std::string> filters = { + "length", "first", "last", "reverse", "sort", "unique", "join", "upper", "lower", + "trim", "default", "tojson", "string", "int", "float", "abs", "list", "dictsort" + }; + const std::vector<std::string> builtins = { + "range", "len", "dict", "list", "join", "str", "int", "float", "namespace" + }; + + t.test("out of bound array access", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + int idx = int_dist(rng); + std::string tmpl = "{{ arr[" + std::to_string(idx) + "] }}"; + json vars = {{"arr", json::array({1, 2, 3})}}; + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("non-existing variables", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + std::string var = random_string(rng, 20); + std::string tmpl = "{{ " + var + " }}"; + json vars = json::object(); // empty context + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("non-existing nested attributes", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + std::string var1 = var_names[choice_dist(rng) % var_names.size()]; + std::string var2 = random_string(rng, 10); + std::string var3 = random_string(rng, 10); + std::string tmpl = "{{ " + var1 + "." + var2 + "." + var3 + " }}"; + 
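// var1 is defined below (mapped to {"other": 123}); the random nested lookups must miss without crashing + 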
+ var3 + " }}"; + json vars = {{var1, {{"other", 123}}}}; + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("invalid filter arguments", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + std::string filter = filters[choice_dist(rng) % filters.size()]; + int val = int_dist(rng); + std::string tmpl = "{{ " + std::to_string(val) + " | " + filter + " }}"; + json vars = json::object(); + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("chained filters on various types", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + std::string f1 = filters[choice_dist(rng) % filters.size()]; + std::string f2 = filters[choice_dist(rng) % filters.size()]; + std::string var = var_names[choice_dist(rng) % var_names.size()]; + std::string tmpl = "{{ " + var + " | " + f1 + " | " + f2 + " }}"; + json vars = { + {"x", 42}, + {"y", "hello"}, + {"arr", json::array({1, 2, 3})}, + {"obj", {{"a", 1}, {"b", 2}}}, + {"items", json::array({"a", "b", "c"})} + }; + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("invalid builtin calls", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + std::string builtin = builtins[choice_dist(rng) % builtins.size()]; + std::string arg; + int arg_type = choice_dist(rng) % 4; + switch (arg_type) { + case 0: arg = "\"not a number\""; break; + case 1: arg = "none"; break; + case 2: arg = std::to_string(int_dist(rng)); break; + case 3: arg = "[]"; break; + } + std::string tmpl = "{{ " + builtin + "(" + arg + ") }}"; + json vars = json::object(); + t.assert_true("should not crash", fuzz_test_template(tmpl, vars)); + } + }); + + t.test("macro edge cases", [&](testing & t) { + // Macro with no args called with args + t.assert_true("macro no args with args", fuzz_test_template( + "{% macro foo() %}hello{% endmacro %}{{ foo(1, 2, 3) }}", + json::object() + )); + + // Macro with args called with no args + t.assert_true("macro with args no args", fuzz_test_template( + "{% macro foo(a, b, c) %}{{ a }}{{ b }}{{ c }}{% endmacro %}{{ foo() }}", + json::object() + )); + + // Recursive macro reference + t.assert_true("recursive macro", fuzz_test_template( + "{% macro foo(n) %}{% if n > 0 %}{{ foo(n - 1) }}{% endif %}{% endmacro %}{{ foo(5) }}", + json::object() + )); + + // Nested macro definitions + for (int i = 0; i < num_iterations / 10; ++i) { + std::string tmpl = "{% macro outer() %}{% macro inner() %}x{% endmacro %}{{ inner() }}{% endmacro %}{{ outer() }}"; + t.assert_true("nested macro", fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("empty and none operations", [&](testing & t) { + const std::vector empty_tests = { + "{{ \"\" | first }}", + "{{ \"\" | last }}", + "{{ [] | first }}", + "{{ [] | last }}", + "{{ none.attr }}", + "{{ none | length }}", + "{{ none | default('fallback') }}", + "{{ {} | first }}", + "{{ {} | dictsort }}", + }; + for (const auto & tmpl : empty_tests) { + t.assert_true("empty/none: " + tmpl, fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("arithmetic edge cases", [&](testing & t) { + const std::vector arith_tests = { + "{{ 1 / 0 }}", + "{{ 1 // 0 }}", + "{{ 1 % 0 }}", + "{{ 999999999999999999 * 999999999999999999 }}", + "{{ -999999999999999999 - 999999999999999999 }}", + "{{ 1.0 / 0.0 }}", + "{{ 0.0 / 0.0 }}", + }; + for (const auto & tmpl : arith_tests) { + t.assert_true("arith: " + tmpl, fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("deeply nested 
structures", [&](testing & t) { + // Deeply nested loops + for (int depth = 1; depth <= 10; ++depth) { + std::string tmpl; + for (int d = 0; d < depth; ++d) { + tmpl += "{% for i" + std::to_string(d) + " in arr %}"; + } + tmpl += "x"; + for (int d = 0; d < depth; ++d) { + tmpl += "{% endfor %}"; + } + json vars = {{"arr", json::array({1, 2})}}; + t.assert_true("nested loops depth " + std::to_string(depth), fuzz_test_template(tmpl, vars)); + } + + // Deeply nested conditionals + for (int depth = 1; depth <= 10; ++depth) { + std::string tmpl; + for (int d = 0; d < depth; ++d) { + tmpl += "{% if true %}"; + } + tmpl += "x"; + for (int d = 0; d < depth; ++d) { + tmpl += "{% endif %}"; + } + t.assert_true("nested ifs depth " + std::to_string(depth), fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("special characters in strings", [&](testing & t) { + const std::vector special_tests = { + "{{ \"}{%\" }}", + "{{ \"}}{{\" }}", + "{{ \"{%%}\" }}", + "{{ \"\\n\\t\\r\" }}", + "{{ \"'\\\"'\" }}", + "{{ \"hello\\x00world\" }}", + }; + for (const auto & tmpl : special_tests) { + t.assert_true("special: " + tmpl, fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("random template generation", [&](testing & t) { + const std::vector fragments = { + "{{ x }}", "{{ y }}", "{{ arr }}", "{{ obj }}", + "{% if true %}a{% endif %}", + "{% if false %}b{% else %}c{% endif %}", + "{% for i in arr %}{{ i }}{% endfor %}", + "{{ x | length }}", "{{ x | first }}", "{{ x | default(0) }}", + "{{ x + y }}", "{{ x - y }}", "{{ x * y }}", + "{{ x == y }}", "{{ x != y }}", "{{ x > y }}", + "{{ range(3) }}", "{{ \"hello\" | upper }}", + "text", " ", "\n", + }; + + for (int i = 0; i < num_iterations; ++i) { + std::string tmpl; + int num_frags = choice_dist(rng) % 10 + 1; + for (int f = 0; f < num_frags; ++f) { + tmpl += fragments[choice_dist(rng) % fragments.size()]; + } + json vars = { + {"x", int_dist(rng)}, + {"y", int_dist(rng)}, + {"arr", json::array({1, 2, 3})}, + {"obj", {{"a", 1}, {"b", 2}}} + }; + t.assert_true("random template #" + std::to_string(i), fuzz_test_template(tmpl, vars)); + } + }); + + t.test("malformed templates (should error, not crash)", [&](testing & t) { + const std::vector malformed = { + "{{ x", + "{% if %}", + "{% for %}", + "{% for x in %}", + "{% endfor %}", + "{% endif %}", + "{{ | filter }}", + "{% if x %}", // unclosed + "{% for i in x %}", // unclosed + "{{ x | }}", + "{% macro %}{% endmacro %}", + "{{{{", + "}}}}", + "{%%}", + "{% set %}", + "{% set x %}", + }; + for (const auto & tmpl : malformed) { + t.assert_true("malformed: " + tmpl, fuzz_test_template(tmpl, json::object())); + } + }); + + t.test("type coercion edge cases", [&](testing & t) { + for (int i = 0; i < num_iterations; ++i) { + int op_choice = choice_dist(rng) % 6; + std::string op; + switch (op_choice) { + case 0: op = "+"; break; + case 1: op = "-"; break; + case 2: op = "*"; break; + case 3: op = "/"; break; + case 4: op = "=="; break; + case 5: op = "~"; break; // string concat + } + + std::string left_var = var_names[choice_dist(rng) % var_names.size()]; + std::string right_var = var_names[choice_dist(rng) % var_names.size()]; + std::string tmpl = "{{ " + left_var + " " + op + " " + right_var + " }}"; + + json vars = { + {"x", 42}, + {"y", "hello"}, + {"z", 3.14}, + {"arr", json::array({1, 2, 3})}, + {"obj", {{"a", 1}}}, + {"items", json::array()}, + {"foo", nullptr}, + {"bar", true} + }; + t.assert_true("type coercion: " + tmpl, fuzz_test_template(tmpl, vars)); + } + }); + + t.test("fuzz 
builtin functions", [&](testing & t) { + // pair of (type_name, builtin_name) + std::vector> builtins; + auto add_fns = [&](std::string type_name, const jinja::func_builtins & added) { + for (const auto & it : added) { + builtins.push_back({type_name, it.first}); + } + }; + add_fns("global", jinja::global_builtins()); + add_fns("int", jinja::value_int_t(0).get_builtins()); + add_fns("float", jinja::value_float_t(0.0f).get_builtins()); + add_fns("string", jinja::value_string_t().get_builtins()); + add_fns("array", jinja::value_array_t().get_builtins()); + add_fns("object", jinja::value_object_t().get_builtins()); + + const int max_args = 5; + const std::vector kwarg_names = { + "base", "attribute", "default", "reverse", "case_sensitive", "by", "safe", "chars", "separators", "sort_keys", "indent", "ensure_ascii", + }; + + // Generate random argument values + auto gen_random_arg = [&]() -> std::string { + int type = choice_dist(rng) % 8; + switch (type) { + case 0: return std::to_string(int_dist(rng)); // int + case 1: return std::to_string(int_dist(rng)) + ".5"; // float + case 2: return "\"" + random_string(rng, 10) + "\""; // string + case 3: return "true"; // bool true + case 4: return "false"; // bool false + case 5: return "none"; // none + case 6: return "[1, 2, 3]"; // array + case 7: return "{\"a\": 1}"; // object + default: return "0"; + } + }; + + for (int i = 0; i < num_iterations; ++i) { + // Pick a random builtin + auto & [type_name, fn_name] = builtins[choice_dist(rng) % builtins.size()]; + + // Generate random number of args + int num_args = choice_dist(rng) % (max_args + 1); + std::string args_str; + for (int a = 0; a < num_args; ++a) { + if (a > 0) args_str += ", "; + // Sometimes use keyword args + if (choice_dist(rng) % 3 == 0 && !kwarg_names.empty()) { + std::string kwarg = kwarg_names[choice_dist(rng) % kwarg_names.size()]; + args_str += kwarg + "=" + gen_random_arg(); + } else { + args_str += gen_random_arg(); + } + } + + std::string tmpl; + if (type_name == "global") { + // Global function call + tmpl = "{{ " + fn_name + "(" + args_str + ") }}"; + } else { + // Method call on a value + std::string base_val; + if (type_name == "int") { + base_val = std::to_string(int_dist(rng)); + } else if (type_name == "float") { + base_val = std::to_string(int_dist(rng)) + ".5"; + } else if (type_name == "string") { + base_val = "\"test_string\""; + } else if (type_name == "array") { + base_val = "[1, 2, 3, \"a\", \"b\"]"; + } else if (type_name == "object") { + base_val = "{\"x\": 1, \"y\": 2}"; + } else { + base_val = "x"; + } + tmpl = "{{ " + base_val + "." + fn_name + "(" + args_str + ") }}"; + } + + json vars = { + {"x", 42}, + {"y", "hello"}, + {"arr", json::array({1, 2, 3})}, + {"obj", {{"a", 1}, {"b", 2}}} + }; + + t.assert_true("builtin " + type_name + "." 
+ fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars)); + } + }); +} diff --git a/llama.cpp/tests/test-json-partial.cpp b/llama.cpp/tests/test-json-partial.cpp new file mode 100644 index 0000000..39da927 --- /dev/null +++ b/llama.cpp/tests/test-json-partial.cpp @@ -0,0 +1,287 @@ +#include "common.h" +#include "json-partial.h" +#include +#include +#include + +template static void assert_equals(const T & expected, const T & actual) { + if (expected != actual) { + std::cerr << "Expected: " << expected << std::endl; + std::cerr << "Actual: " << actual << std::endl; + std::cerr << std::flush; + throw std::runtime_error("Test failed"); + } +} + +static void test_json_healing() { + auto parse = [](const std::string & str) { + std::cerr << "# Parsing: " << str << '\n'; + std::string::const_iterator it = str.begin(); + const auto end = str.end(); + common_json out; + std::string healing_marker = "$llama.cpp.json$"; + if (common_json_parse(it, end, healing_marker, out)) { + auto dump = out.json.dump(); + std::cerr << "Parsed: " << dump << '\n'; + std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n'; + std::string result; + if (!out.healing_marker.json_dump_marker.empty()) { + auto i = dump.find(out.healing_marker.json_dump_marker); + if (i == std::string::npos) { + throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")"); + } + result = dump.substr(0, i); + } else { + result = dump; + } + std::cerr << "Result: " << result << '\n'; + if (string_starts_with(str, result)) { + std::cerr << "Failure!\n"; + } + // return dump; + } else { + throw std::runtime_error("Failed to parse: " + str); + } + + }; + auto parse_all = [&](const std::string & str) { + for (size_t i = 1; i < str.size(); i++) { + parse(str.substr(0, i)); + } + }; + parse_all("{\"a\": \"b\"}"); + parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}"); + + parse_all("[{\"a\": \"b\"}]"); + + auto test = [&](const std::vector & inputs, const std::string & expected, const std::string & expected_marker) { + for (const auto & input : inputs) { + common_json out; + assert_equals(true, common_json_parse(input, "$foo", out)); + assert_equals(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true)); + assert_equals(expected_marker, out.healing_marker.json_dump_marker); + } + }; + // No healing needed: + test( + { + R"([{"a":"b"}, "y"])", + }, + R"([{"a":"b"},"y"])", + "" + ); + // Partial literals can't be healed: + test( + { + R"([1)", + R"([tru)", + R"([n)", + R"([nul)", + R"([23.2)", + }, + R"(["$foo"])", + R"("$foo)" + ); + test( + { + R"({"a": 1)", + R"({"a": tru)", + R"({"a": n)", + R"({"a": nul)", + R"({"a": 23.2)", + }, + R"({"a":"$foo"})", + R"("$foo)" + ); + test( + { + R"({)", + }, + R"({"$foo":1})", + R"("$foo)" + ); + test( + { + R"([)", + }, + R"(["$foo"])", + R"("$foo)" + ); + // Healing right after a full literal + test( + { + R"(1 )", + }, + R"(1)", + "" + ); + test( + { + R"(true)", + R"(true )", + }, + R"(true)", + "" + ); + test( + { + R"(null)", + R"(null )", + }, + R"(null)", + "" + ); + test( + { + R"([1 )", + }, + R"([1,"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([{})", + R"([{} )", + }, + R"([{},"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([true)", + }, + // TODO: detect the true/false/null literal was complete + R"(["$foo"])", + R"("$foo)" + ); + test( + { + R"([true )", + }, + R"([true,"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([true,)", + }, + R"([true,"$foo"])", + R"("$foo)" + ); + // Test 
test( + { + R"([{"a": [{"b": [{)", + }, + R"([{"a":[{"b":[{"$foo":1}]}]}])", + R"("$foo)" + ); + test( + { + R"([{"a": [{"b": [)", + }, + R"([{"a":[{"b":["$foo"]}]}])", + R"("$foo)" + ); + + test( + { + R"([{"a": "b"})", + R"([{"a": "b"} )", + }, + R"([{"a":"b"},"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([{"a": "b"},)", + R"([{"a": "b"}, )", + }, + R"([{"a":"b"},"$foo"])", + R"("$foo)" + ); + test( + { + R"({ "code)", + }, + R"({"code$foo":1})", + R"($foo)" + ); + test( + { + R"({ "code\)", + }, + R"({"code\\$foo":1})", + R"(\$foo)" + ); + test( + { + R"({ "code")", + }, + R"({"code":"$foo"})", + R"(:"$foo)" + ); + test( + { + R"({ "key")", + }, + R"({"key":"$foo"})", + R"(:"$foo)" + ); + // Test unicode escape sequences + test( + { + R"({"a":"\u)", + }, + R"({"a":"\u0000$foo"})", + R"(0000$foo)" + ); + test( + { + R"({"a":"\u00)", + }, + R"({"a":"\u0000$foo"})", + R"(00$foo)" + ); + test( + { + R"({"a":"\ud300)", + }, + R"({"a":"\ud300$foo"})", + R"($foo)" + ); + test( + { + R"({"a":"\ud800)", + }, + R"({"a":"\ud800\udc00$foo"})", + R"(\udc00$foo)" + ); + test( + { + R"({"a":"\ud800\)", + }, + R"({"a":"\ud800\udc00$foo"})", + R"(udc00$foo)" + ); + test( + { + R"({"a":"\ud800\u)", + }, + R"({"a":"\ud800\udc00$foo"})", + R"(dc00$foo)" + ); + test( + { + R"({"a":"\ud800\udc00)", + }, + R"({"a":"\ud800\udc00$foo"})", + R"($foo)" + ); +} + +int main() { + test_json_healing(); + std::cerr << "All tests passed.\n"; + return 0; +} diff --git a/llama.cpp/tests/test-json-schema-to-grammar.cpp b/llama.cpp/tests/test-json-schema-to-grammar.cpp new file mode 100755 index 0000000..a8e9ff3 --- /dev/null +++ b/llama.cpp/tests/test-json-schema-to-grammar.cpp @@ -0,0 +1,1490 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include "json-schema-to-grammar.h" + +#include "../src/llama-grammar.h" + +#include <nlohmann/json.hpp> + +#include <cassert> +#include <fstream> +#include <regex> +#include <sstream> + +static std::string trim(const std::string & source) { + std::string s(source); + s.erase(0, s.find_first_not_of(" \n\r\t")); + s.erase(s.find_last_not_of(" \n\r\t") + 1); + return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1"); +} + +enum TestCaseStatus { + SUCCESS, + FAILURE +}; + +struct TestCase { + TestCaseStatus expected_status; + std::string name; + std::string schema; + std::string expected_grammar; + + void _print_failure_header() const { + fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str()); + } + void verify(const std::string & actual_grammar) const { + if (trim(actual_grammar) != trim(expected_grammar)) { + _print_failure_header(); + fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str()); + assert(false); + } + } + void verify_expectation_parseable() const { + try { + llama_grammar_parser state; + state.parse(expected_grammar.c_str()); + if (state.symbol_ids.find("root") == state.symbol_ids.end()) { + throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar); + } + } catch (const std::runtime_error & ex) { + _print_failure_header(); + fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what()); + assert(false); + } + } + void verify_status(TestCaseStatus status) const { + if (status != expected_status) { + _print_failure_header(); + fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE"); + fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? 
"SUCCESS" : "FAILURE"); + assert(false); + } + } +}; + +static void write(const std::string & file, const std::string & content) { + std::ofstream f; + f.open(file.c_str()); + f << content.c_str(); + f.close(); +} + +static std::string read(const std::string & file) { + std::ostringstream actuals; + actuals << std::ifstream(file.c_str()).rdbuf(); + return actuals.str(); +} + +static void test_all(const std::string & lang, std::function runner) { + fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str()); + auto test = [&](const TestCase & tc) { + fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? " (failure expected)" : ""); + runner(tc); + }; + + test({ + SUCCESS, + "min 0", + R"""({ + "type": "integer", + "minimum": 0 + })""", + R"""( + root ::= ([0] | [1-9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 1", + R"""({ + "type": "integer", + "minimum": 1 + })""", + R"""( + root ::= ([1-9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 3", + R"""({ + "type": "integer", + "minimum": 3 + })""", + R"""( + root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 9", + R"""({ + "type": "integer", + "minimum": 9 + })""", + R"""( + root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 10", + R"""({ + "type": "integer", + "minimum": 10 + })""", + R"""( + root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 25", + R"""({ + "type": "integer", + "minimum": 25 + })""", + R"""( + root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "max 30", + R"""({ + "type": "integer", + "maximum": 30 + })""", + R"""( + root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min -5", + R"""({ + "type": "integer", + "minimum": -5 + })""", + R"""( + root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + R"""( + root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "max -5", + R"""({ + "type": "integer", + "maximum": -5 + })""", + R"""( + root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "max 1", + R"""({ + "type": "integer", + "maximum": 1 + })""", + R"""( + root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "max 100", + R"""({ + "type": "integer", + "maximum": 100 + })""", + R"""( + root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 0 max 23", + R"""({ + "type": "integer", + "minimum": 0, + "maximum": 23 + })""", + R"""( + root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + 
SUCCESS, + "min 15 max 300", + R"""({ + "type": "integer", + "minimum": 15, + "maximum": 300 + })""", + R"""( + root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min 5 max 30", + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + R"""( + root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min -123 max 42", + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + R"""( + root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min -10 max 10", + R"""({ + "type": "integer", + "minimum": -10, + "maximum": 10 + })""", + R"""( + root ::= ("-" ([0-9] | "10") | [0-9] | "10") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + FAILURE, + "unknown type", + R"""({ + "type": "kaboom" + })""", + "" + }); + + test({ + FAILURE, + "invalid type", + R"""({ + "type": 123 + })""", + "" + }); + + test({ + SUCCESS, + "empty schema (object)", + "{}", + R"""( + array ::= "[" space ( value ("," space value)* )? "]" space + boolean ::= ("true" | "false") space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + null ::= "null" space + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space + root ::= object + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + value ::= object | array | string | number | boolean | null + )""" + }); + + test({ + SUCCESS, + "exotic formats", + R"""({ + "items": [ + { "format": "date" }, + { "format": "uuid" }, + { "format": "time" }, + { "format": "date-time" } + ] + })""", + R"""( + date ::= [0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] ) + date-string ::= "\"" date "\"" space + date-time ::= date "T" time + date-time-string ::= "\"" date-time "\"" space + root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? 
( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] ) + time-string ::= "\"" time "\"" space + tuple-0 ::= date-string + tuple-2 ::= time-string + tuple-3 ::= date-time-string + uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space + )""" + }); + + test({ + SUCCESS, + "string", + R"""({ + "type": "string" + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "\"" char* "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string w/ min length 1", + R"""({ + "type": "string", + "minLength": 1 + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "\"" char+ "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string w/ min length 3", + R"""({ + "type": "string", + "minLength": 3 + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "\"" char{3,} "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string w/ max length", + R"""({ + "type": "string", + "maxLength": 3 + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "\"" char{0,3} "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string w/ min & max length", + R"""({ + "type": "string", + "minLength": 1, + "maxLength": 4 + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "\"" char{1,4} "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "boolean", + R"""({ + "type": "boolean" + })""", + R"""( + root ::= ("true" | "false") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "integer", + R"""({ + "type": "integer" + })""", + R"""( + integral-part ::= [0] | [1-9] [0-9]{0,15} + root ::= ("-"? integral-part) space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string const", + R"""({ + "const": "foo" + })""", + R"""( + root ::= "\"foo\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "non-string const", + R"""({ + "const": 123 + })""", + R"""( + root ::= "123" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "non-string enum", + R"""({ + "enum": ["red", "amber", "green", null, 42, ["foo"]] + })""", + R"""( + root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "string array", + R"""({ + "type": "array", + "prefixItems": { "type": "string" } + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "[" space (string ("," space string)*)? "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "nullable string array", + R"""({ + "type": ["array", "null"], + "prefixItems": { "type": "string" } + })""", + R"""( + alternative-0 ::= "[" space (string ("," space string)*)? 
"]" space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + null ::= "null" space + root ::= alternative-0 | null + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "tuple1", + R"""({ + "prefixItems": [{ "type": "string" }] + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "[" space string "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "tuple2", + R"""({ + "prefixItems": [{ "type": "string" }, { "type": "number" }] + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "[" space string "," space number "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "number", + R"""({ + "type": "number" + })""", + R"""( + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "minItems", + R"""({ + "items": { + "type": "boolean" + }, + "minItems": 2 + })""", + R"""( + boolean ::= ("true" | "false") space + root ::= "[" space boolean ("," space boolean)+ "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "maxItems 0", + R"""({ + "items": { + "type": "boolean" + }, + "maxItems": 0 + })""", + R"""( + boolean ::= ("true" | "false") space + root ::= "[" space "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "maxItems 1", + R"""({ + "items": { + "type": "boolean" + }, + "maxItems": 1 + })""", + R"""( + boolean ::= ("true" | "false") space + root ::= "[" space boolean? "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "maxItems 2", + R"""({ + "items": { + "type": "boolean" + }, + "maxItems": 2 + })""", + R"""( + boolean ::= ("true" | "false") space + root ::= "[" space (boolean ("," space boolean)?)? "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min + maxItems", + R"""({ + "items": { + "type": ["number", "integer"] + }, + "minItems": 3, + "maxItems": 5 + })""", + R"""( + decimal-part ::= [0-9]{1,16} + integer ::= ("-"? integral-part) space + integral-part ::= [0] | [1-9] [0-9]{0,15} + item ::= number | integer + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? 
space + root ::= "[" space item ("," space item){2,4} "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min + max items with min + max values across zero", + R"""({ + "items": { + "type": "integer", + "minimum": -12, + "maximum": 207 + }, + "minItems": 3, + "maxItems": 5 + })""", + R"""( + item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space + root ::= "[" space item ("," space item){2,4} "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "min + max items with min + max values", + R"""({ + "items": { + "type": "integer", + "minimum": 12, + "maximum": 207 + }, + "minItems": 3, + "maxItems": 5 + })""", + R"""( + item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space + root ::= "[" space item ("," space item){2,4} "]" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "simple regexp", + R"""({ + "type": "string", + "pattern": "^abc?d*efg+(hij)?kl$" + })""", + R"""( + root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "regexp escapes", + R"""({ + "type": "string", + "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$" + })""", + R"""( + root ::= "\"" ("[]{}()|+*?") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "regexp quote", + R"""({ + "type": "string", + "pattern": "^\"$" + })""", + R"""( + root ::= "\"" ("\"") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "regexp with top-level alternation", + R"""({ + "type": "string", + "pattern": "^A|B|C|D$" + })""", + R"""( + root ::= "\"" ("A" | "B" | "C" | "D") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "regexp", + R"""({ + "type": "string", + "pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} a{3,5}nd...$" + })""", + R"""( + dot ::= [^\x0A\x0D] + root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space + root-1 ::= [0-9] + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "required props in original order", + R"""({ + "type": "object", + "properties": { + "b": {"type": "string"}, + "c": {"type": "string"}, + "a": {"type": "string"} + }, + "required": [ + "a", + "b", + "c" + ], + "additionalProperties": false, + "definitions": {} + })""", + R"""( + a-kv ::= "\"a\"" space ":" space string + b-kv ::= "\"b\"" space ":" space string + c-kv ::= "\"c\"" space ":" space string + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "1 optional prop", + R"""({ + "properties": { + "a": { + "type": "string" + } + }, + "additionalProperties": false + })""", + R"""( + a-kv ::= "\"a\"" space ":" space string + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "{" space (a-kv )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "N optional props", + R"""({ + "properties": { + "a": {"type": "string"}, + "b": {"type": "string"}, + "c": {"type": "string"} + }, + "additionalProperties": false + })""", + R"""( + a-kv ::= "\"a\"" space ":" space string + a-rest ::= ( "," space b-kv )? 
b-rest + b-kv ::= "\"b\"" space ":" space string + b-rest ::= ( "," space c-kv )? + c-kv ::= "\"c\"" space ":" space string + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "required + optional props each in original order", + R"""({ + "properties": { + "b": {"type": "string"}, + "a": {"type": "string"}, + "d": {"type": "string"}, + "c": {"type": "string"} + }, + "required": ["a", "b"], + "additionalProperties": false + })""", + R"""( + a-kv ::= "\"a\"" space ":" space string + b-kv ::= "\"b\"" space ":" space string + c-kv ::= "\"c\"" space ":" space string + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + d-kv ::= "\"d\"" space ":" space string + d-rest ::= ( "," space c-kv )? + root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "additional props", + R"""({ + "type": "object", + "additionalProperties": {"type": "array", "items": {"type": "number"}} + })""", + R"""( + additional-kv ::= string ":" space additional-value + additional-value ::= "[" space (number ("," space number)*)? "]" space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space (additional-kv ( "," space additional-kv )* )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "additional props (true)", + R"""({ + "type": "object", + "additionalProperties": true + })""", + R"""( + array ::= "[" space ( value ("," space value)* )? "]" space + boolean ::= ("true" | "false") space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + null ::= "null" space + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space + root ::= object + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + value ::= object | array | string | number | boolean | null + )""" + }); + + test({ + SUCCESS, + "additional props (implicit)", + R"""({ + "type": "object" + })""", + R"""( + array ::= "[" space ( value ("," space value)* )? "]" space + boolean ::= ("true" | "false") space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + null ::= "null" space + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? 
"}" space + root ::= object + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + value ::= object | array | string | number | boolean | null + )""" + }); + + test({ + SUCCESS, + "empty w/o additional props", + R"""({ + "type": "object", + "additionalProperties": false + })""", + R"""( + root ::= "{" space "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "required + additional props", + R"""({ + "type": "object", + "properties": { + "a": {"type": "number"} + }, + "required": ["a"], + "additionalProperties": {"type": "string"} + })""", + R"""( + a-kv ::= "\"a\"" space ":" space number + additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space + additional-kv ::= additional-k ":" space string + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "optional + additional props", + R"""({ + "type": "object", + "properties": { + "a": {"type": "number"} + }, + "additionalProperties": {"type": "number"} + })""", + R"""( + a-kv ::= "\"a\"" space ":" space number + a-rest ::= ( "," space additional-kv )* + additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space + additional-kv ::= additional-k ":" space number + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "required + optional + additional props", + R"""({ + "type": "object", + "properties": { + "and": {"type": "number"}, + "also": {"type": "number"} + }, + "required": ["and"], + "additionalProperties": {"type": "number"} + })""", + R"""( + additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space + additional-kv ::= additional-k ":" space number + also-kv ::= "\"also\"" space ":" space number + also-rest ::= ( "," space additional-kv )* + and-kv ::= "\"and\"" space ":" space number + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "optional props with empty name", + R"""({ + "properties": { + "": {"type": "integer"}, + "a": {"type": "integer"} + }, + "additionalProperties": {"type": "integer"} + })""", + R"""( + -kv ::= "\"\"" space ":" space root + -rest ::= ( "," space a-kv )? a-rest + a-kv ::= "\"a\"" space ":" space integer + a-rest ::= ( "," space additional-kv )* + additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space + additional-kv ::= additional-k ":" space integer + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + integer ::= ("-"? 
integral-part) space + integral-part ::= [0] | [1-9] [0-9]{0,15} + root ::= ("-"? integral-part) space + root0 ::= "{" space (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "optional props with nested names", + R"""({ + "properties": { + "a": {"type": "integer"}, + "aa": {"type": "integer"} + }, + "additionalProperties": {"type": "integer"} + })""", + R"""( + a-kv ::= "\"a\"" space ":" space integer + a-rest ::= ( "," space aa-kv )? aa-rest + aa-kv ::= "\"aa\"" space ":" space integer + aa-rest ::= ( "," space additional-kv )* + additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space + additional-kv ::= additional-k ":" space integer + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + integer ::= ("-"? integral-part) space + integral-part ::= [0] | [1-9] [0-9]{0,15} + root ::= "{" space (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "optional props with common prefix", + R"""({ + "properties": { + "ab": {"type": "integer"}, + "ac": {"type": "integer"} + }, + "additionalProperties": {"type": "integer"} + })""", + R"""( + ab-kv ::= "\"ab\"" space ":" space integer + ab-rest ::= ( "," space ac-kv )? ac-rest + ac-kv ::= "\"ac\"" space ":" space integer + ac-rest ::= ( "," space additional-kv )* + additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space + additional-kv ::= additional-k ":" space integer + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + integer ::= ("-"? integral-part) space + integral-part ::= [0] | [1-9] [0-9]{0,15} + root ::= "{" space (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "top-level $ref", + R"""({ + "$ref": "#/definitions/foo", + "definitions": { + "foo": { + "type": "object", + "properties": { + "a": { + "type": "string" + } + }, + "required": [ + "a" + ], + "additionalProperties": false + } + } + })""", + R"""( + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + ref-definitions-foo ::= "{" space ref-definitions-foo-a-kv "}" space + ref-definitions-foo-a-kv ::= "\"a\"" space ":" space string + root ::= ref-definitions-foo + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "anyOf", + R"""({ + "anyOf": [ + {"$ref": "#/definitions/foo"}, + {"$ref": "#/definitions/bar"} + ], + "definitions": { + "foo": { + "properties": {"a": {"type": "number"}} + }, + "bar": { + "properties": {"b": {"type": "number"}} + } + }, + "type": "object" + })""", + R"""( + alternative-0 ::= ref-definitions-foo + alternative-1 ::= ref-definitions-bar + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + ref-definitions-bar ::= "{" space (ref-definitions-bar-b-kv )? "}" space + ref-definitions-bar-b-kv ::= "\"b\"" space ":" space number + ref-definitions-foo ::= "{" space (ref-definitions-foo-a-kv )? 
"}" space + ref-definitions-foo-a-kv ::= "\"a\"" space ":" space number + root ::= alternative-0 | alternative-1 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "anyOf $ref", + R"""({ + "properties": { + "a": { + "anyOf": [ + {"type": "string"}, + {"type": "number"} + ] + }, + "b": { + "anyOf": [ + {"$ref": "#/properties/a/anyOf/0"}, + {"type": "boolean"} + ] + } + }, + "type": "object" + })""", + R"""( + a ::= string | number + a-kv ::= "\"a\"" space ":" space a + a-rest ::= ( "," space b-kv )? + b ::= b-0 | boolean + b-0 ::= string + b-kv ::= "\"b\"" space ":" space b + boolean ::= ("true" | "false") space + char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space (a-kv a-rest | b-kv )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + string ::= "\"" char* "\"" space + )""" + }); + + test({ + SUCCESS, + "mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)", + R"""({ + "allOf": [ + {"$ref": "#/definitions/foo"}, + {"$ref": "#/definitions/bar"}, + { + "anyOf": [ + {"$ref": "#/definitions/baz"}, + {"$ref": "#/definitions/bam"} + ] + } + ], + "definitions": { + "foo": { + "properties": {"a": {"type": "number"}} + }, + "bar": { + "properties": {"b": {"type": "number"}} + }, + "bam": { + "properties": {"c": {"type": "number"}} + }, + "baz": { + "properties": {"d": {"type": "number"}} + } + }, + "type": "object" + })""", + R"""( + a-kv ::= "\"a\"" space ":" space number + b-kv ::= "\"b\"" space ":" space number + c-kv ::= "\"c\"" space ":" space number + d-kv ::= "\"d\"" space ":" space number + d-rest ::= ( "," space c-kv )? + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space + root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "allOf with enum schema", + R"""({ + "allOf": [ + {"$ref": "#/definitions/foo"} + ], + "definitions": { + "foo": { + "type": "string", + "enum": ["a", "b"] + } + } + })""", + R"""( + root ::= ("\"a\"" | "\"b\"") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "allOf with multiple enum schemas", + R"""({ + "allOf": [ + {"$ref": "#/definitions/foo"}, + {"$ref": "#/definitions/bar"} + ], + "definitions": { + "foo": { + "type": "string", + "enum": ["a", "b", "c"] + }, + "bar": { + "type": "string", + "enum": ["b", "c", "d"] + } + } + })""", + R"""( + root ::= ("\"b\"" | "\"c\"") space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "conflicting names", + R"""({ + "type": "object", + "properties": { + "number": { + "type": "object", + "properties": { + "number": { + "type": "object", + "properties": { + "root": { + "type": "number" + } + }, + "required": [ + "root" + ], + "additionalProperties": false + } + }, + "required": [ + "number" + ], + "additionalProperties": false + } + }, + "required": [ + "number" + ], + "additionalProperties": false, + "definitions": {} + })""", + R"""( + decimal-part ::= [0-9]{1,16} + integral-part ::= [0] | [1-9] [0-9]{0,15} + number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? 
space + number- ::= "{" space number-number-kv "}" space + number-kv ::= "\"number\"" space ":" space number- + number-number ::= "{" space number-number-root-kv "}" space + number-number-kv ::= "\"number\"" space ":" space number-number + number-number-root-kv ::= "\"root\"" space ":" space number + root ::= "{" space number-kv "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); + + test({ + SUCCESS, + "literal string with escapes", + R"""({ + "properties": { + "code": { + "const": " \r \n \" \\ ", + "description": "Generated code", + "title": "Code", + "type": "string" + } + }, + "required": [ + "code" + ], + "title": "DecoderResponse", + "type": "object" + })""", + R"""( + code ::= "\" \\r \\n \\\" \\\\ \"" space + code-kv ::= "\"code\"" space ":" space code + root ::= "{" space code-kv "}" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""" + }); +} + +static void test_resolves_to_string() { + fprintf(stderr, "#\n# Testing resolves_to_string\n#\n"); + + auto test = [](const std::string & name, const std::string & schema_str, bool expected) { + fprintf(stderr, "- %s\n", name.c_str()); + common_schema_info info; + auto schema = nlohmann::ordered_json::parse(schema_str); + info.resolve_refs(schema); + bool result = info.resolves_to_string(schema); + if (result != expected) { + fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str()); + fprintf(stderr, "Schema: %s\n", schema_str.c_str()); + fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false"); + assert(false); + } + }; + + // Basic type checks + test("type string", R"({"type": "string"})", true); + test("type integer", R"({"type": "integer"})", false); + test("type number", R"({"type": "number"})", false); + test("type boolean", R"({"type": "boolean"})", false); + test("type object", R"({"type": "object"})", false); + test("type array", R"({"type": "array"})", false); + + // Type array (nullable string) + test("type array with string", R"({"type": ["string", "null"]})", true); + test("type array without string", R"({"type": ["integer", "null"]})", false); + + // String-specific keywords + test("minLength implies string", R"({"minLength": 1})", true); + test("maxLength implies string", R"({"maxLength": 10})", true); + test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true); + + // Format + test("format date", R"({"format": "date"})", true); + test("format uuid", R"({"format": "uuid"})", true); + test("format email", R"({"format": "email"})", true); + + // Const + test("const string", R"({"const": "hello"})", true); + test("const number", R"({"const": 123})", false); + + // Enum + test("enum with strings", R"({"enum": ["a", "b", "c"]})", true); + test("enum with numbers", R"({"enum": [1, 2, 3]})", false); + test("enum mixed with string", R"({"enum": [1, "a", null]})", true); + + // anyOf + test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true); + test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false); + + // oneOf + test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true); + test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false); + + // allOf - all must be strings + test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true); + test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false); + + // $ref + test("$ref to string", + R"({"$ref": "#/$defs/str", "$defs": 
{"str": {"type": "string"}}})", true); + test("$ref to integer", + R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false); + + // Nested + test("nested anyOf with string", + R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true); + + fprintf(stderr, "All resolves_to_string tests passed!\n"); +} + +int main() { + fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false"); + fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false"); + + test_resolves_to_string(); + + test_all("C++", [](const TestCase & tc) { + try { + tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true)); + tc.verify_status(SUCCESS); + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "Error: %s\n", ex.what()); + tc.verify_status(FAILURE); + } + }); + + if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { + fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); + } else { + if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { + test_all("Python", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); + } + + if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { + test_all("JavaScript", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? 
SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        }
+    }
+
+    test_all("Check Expectations Validity", [](const TestCase & tc) {
+        if (tc.expected_status == SUCCESS) {
+            tc.verify_expectation_parseable();
+        }
+    });
+}
diff --git a/llama.cpp/tests/test-llama-grammar.cpp b/llama.cpp/tests/test-llama-grammar.cpp
new file mode 100644
index 0000000..fd45d5a
--- /dev/null
+++ b/llama.cpp/tests/test-llama-grammar.cpp
@@ -0,0 +1,408 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "llama.h"
+
+#include "../src/llama-grammar.h"
+
+#include <cassert>
+#include <stdexcept>
+
+int main()
+{
+    llama_grammar_parser parsed_grammar;
+
+    std::vector<std::pair<std::string, uint32_t>> expected = {
+        {"expr", 2},
+        {"expr_6", 6},
+        {"expr_7", 7},
+        {"ident", 8},
+        {"ident_10", 10},
+        {"num", 9},
+        {"num_11", 11},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_5", 5},
+        {"term", 4},
+        {"ws", 3},
+        {"ws_12", 12},
+    };
+
+    std::vector<std::vector<llama_grammar_element>> expected_rules = {
+        {{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_RULE_REF, 2},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_RULE_REF, 4},
+            {LLAMA_GRETYPE_CHAR, 10},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
+        {{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_RULE_REF, 8},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_RULE_REF, 9},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_CHAR, 40},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_RULE_REF, 2},
+            {LLAMA_GRETYPE_CHAR, 41},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 45},
+            {LLAMA_GRETYPE_CHAR_ALT, 43},
+            {LLAMA_GRETYPE_CHAR_ALT, 42},
+            {LLAMA_GRETYPE_CHAR_ALT, 47},
+            {LLAMA_GRETYPE_RULE_REF, 4},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 97},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+            {LLAMA_GRETYPE_RULE_REF, 10},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 97},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+            {LLAMA_GRETYPE_CHAR_ALT, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_CHAR_ALT, 95},
+            {LLAMA_GRETYPE_RULE_REF, 10},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_RULE_REF, 11},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_CHAR, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 32},
+            {LLAMA_GRETYPE_CHAR_ALT, 9},
+            {LLAMA_GRETYPE_CHAR_ALT, 10},
+            {LLAMA_GRETYPE_RULE_REF, 12},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_END, 0},
+        },
+    };
+
+    for (auto pair : expected)
+    {
+        parsed_grammar.symbol_ids[pair.first] = pair.second;
+    }
+
+    for (auto rule : expected_rules)
+    {
+        parsed_grammar.rules.emplace_back();
+        for (auto element : rule)
+        {
+            parsed_grammar.rules.back().push_back(element);
+        }
+    }
+
+    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    
if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
+
+    std::vector<std::vector<llama_grammar_element>> expected_stacks = {
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 97},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 40},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 97},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 40},
+        }};
+
+    auto index = 0;
+    for (const llama_grammar_stack & stack : llama_grammar_get_stacks(grammar))
+    {
+        // compare stack to expected_stack
+        for (uint32_t i = 0; i < stack.size(); i++)
+        {
+            const llama_grammar_element * element = stack[i];
+            const llama_grammar_element & expected_element = expected_stacks[index][i];
+
+            // pretty print error message before asserting
+            if (expected_element.type != element->type || expected_element.value != element->value)
+            {
+                fprintf(stderr, "index: %d\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
+                fprintf(stderr, "expected_element != actual_element\n");
+            }
+
+            assert(expected_element.type == element->type && expected_element.value == element->value);
+        }
+        index++;
+    }
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    next_candidates.resize(24);
+
+    for (size_t i = 0; i < 24; ++i)
+    {
+        uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
+        cp[0] = 37 + i;
+        cp[1] = 0;
+        next_candidates[i] = {i, cp, {}, 0};
+    }
+
+    std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 
42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+    };
+
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
+
+    std::vector<std::vector<llama_grammar_candidate>> all_rejects;
+
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
+    {
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
+        all_rejects.push_back(rejects);
+    }
+
+    index = 0;
+    for (auto rej : all_rejects)
+    {
+        for (uint32_t i = 0; i < rej.size(); i++)
+        {
+            auto element = rej[i];
+            auto expected_element = expected_reject[index][i];
+            assert(element.index == expected_element.first && *element.code_points == expected_element.second);
+        }
+        index++;
+    }
+
+    for (auto &candidate : next_candidates)
+    {
+        delete[] candidate.code_points;
+        candidate.code_points = nullptr;
+    }
+
+    llama_grammar_free_impl(grammar);
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-log.cpp b/llama.cpp/tests/test-log.cpp
new file mode 100644
index 0000000..306f28c
--- /dev/null
+++ b/llama.cpp/tests/test-log.cpp
@@ -0,0 +1,39 @@
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+    const int n_thread = 8;
+
+    std::thread threads[n_thread];
+    for (int i = 0; i < n_thread; i++) {
+        threads[i] = std::thread([i]() {
+            const int n_msg = 1000;
+
+            for (int j = 0; j < n_msg; j++) {
+                const int log_type = std::rand() % 4;
+
+                switch (log_type) {
+                    case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+                    case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+                    case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+                    case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+                    default:
+                        break;
+                }
+
+                if (rand () % 10 < 5) {
+                    common_log_set_timestamps(common_log_main(), rand() % 2);
+                    common_log_set_prefix    (common_log_main(), rand() % 2);
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < n_thread; i++) {
+        threads[i].join();
+    }
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-lora-conversion-inference.sh b/llama.cpp/tests/test-lora-conversion-inference.sh
new file mode 100755
index 0000000..e7d6776
--- /dev/null
+++ b/llama.cpp/tests/test-lora-conversion-inference.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+set -e
+
+# Array of models to iterate over
+declare -a params=(
+    "Gemma2ForCausalLM 64"
+    "LlamaForCausalLM 64"
+    "Phi3ForCausalLM 64"
+)
+
+MODELS_REPO=lora-tests
+MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
+COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890
+
+# Clone the Hugging Face repository if the directory does not exist
+if [ ! -d "$MODELS_REPO" ]; then
+    echo "Cloning the Hugging Face repository..."
+    git clone $MODELS_REPO_URL --depth 1
+    cd $MODELS_REPO
+    git fetch --depth=1 origin $COMMIT
+    git reset --hard $COMMIT
+    cd -
+else
+    echo "Repository already exists. Skipping clone."
+fi + +# Array to store results to print +results=() + +trim_leading_whitespace() { + local input_string="$1" + echo "${input_string#"${input_string%%[![:space:]]*}"}" +} + +extract_starting_substring() { + local reference_string="$1" + local target_string="$2" + + local target_length=${#target_string} + echo "${reference_string:0:$target_length}" +} + +get_first_word() { + local input_string="$1" + read -r first_word _ <<< "$input_string" + echo "$first_word" +} + +# Load the expected strings +EXPECTED_BASE_FULL=$(cat $MODELS_REPO/data/pale_blue_dot.txt) +EXPECTED_LORA_FULL=$(cat $MODELS_REPO/data/bohemian_rhapsody.txt) +EXPECTED_BASE_FIRST_WORD=$(get_first_word "$EXPECTED_BASE_FULL") +EXPECTED_LORA_FIRST_WORD=$(get_first_word "$EXPECTED_LORA_FULL") + +run_conversion_and_inference_lora() { + local model_name=$1 + local hidden_size=$2 + + echo -e "\n\n-------- RUNNING TEST FOR MODEL $model_name --------\n\n" + + # Convert safetensors to gguf + echo "Running convert_hf_to_gguf.py for $model_name with hidden_size $hidden_size..." + python convert_hf_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \ + --outfile $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + --outtype f32 + + echo -e "\n\n---------------------------\n\n" + echo "Running convert_lora_to_gguf.py for $model_name with hidden_size $hidden_size..." + python3 convert_lora_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora \ + --base $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \ + --outtype f32 + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-export-lora with lora for $model_name with hidden_size $hidden_size..." + ./llama-export-lora \ + -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + -o $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \ + --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf + + # Run inference + echo -e "\n\n---------------------------\n\n" + echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..." + OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0) + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..." + OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \ + --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \ + -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0) + + echo -e "\n\n---------------------------\n\n" + echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..." 
+    OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    # Remove any initial white space
+    OUTPUT_BASE=$(trim_leading_whitespace "$OUTPUT_BASE")
+    OUTPUT_LORA_HOT=$(trim_leading_whitespace "$OUTPUT_LORA_HOT")
+    OUTPUT_LORA_MERGED=$(trim_leading_whitespace "$OUTPUT_LORA_MERGED")
+    # Extract the corresponding substring from full string
+    EXPECTED_BASE=$(extract_starting_substring "$EXPECTED_BASE_FULL" "$OUTPUT_BASE")
+    EXPECTED_LORA=$(extract_starting_substring "$EXPECTED_LORA_FULL" "$OUTPUT_LORA_HOT")
+
+    # Assert output equals the expected output
+    if [[ "$OUTPUT_BASE" != "$EXPECTED_BASE" ]]; then
+        echo "Error: $model_name OUTPUT_BASE does not start with the expected string."
+        echo -e "Out=$OUTPUT_BASE\n\nExp=$EXPECTED_BASE"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_HOT" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_HOT does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_HOT\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_MERGED" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_MERGED does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_MERGED\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+
+    # Store the results
+    results+=("
+    \n\033[1mResults for $model_name with hidden_size $hidden_size:\033[0m
+    \n\033[32m  • Base:\n$OUTPUT_BASE
+    \n\033[34m  • Lora hot:\n$OUTPUT_LORA_HOT
+    \n\033[36m  • Lora merged:\n$OUTPUT_LORA_MERGED
+    \n \033[0m
+    ")
+
+    echo "All tests passed for $model_name with hidden_size $hidden_size!"
+}
+
+# Run test for each model
+for param in "${params[@]}"; do
+    run_conversion_and_inference_lora $param
+done
+
+# Print results
+echo -e "\n\n---------------------------\n\n"
+echo -e "\n\033[1mSummary of All Results:\033[0m"
+for result in "${results[@]}"; do
+    echo -e "$result"
+done
diff --git a/llama.cpp/tests/test-model-load-cancel.cpp b/llama.cpp/tests/test-model-load-cancel.cpp
new file mode 100644
index 0000000..9095826
--- /dev/null
+++ b/llama.cpp/tests/test-model-load-cancel.cpp
@@ -0,0 +1,27 @@
+#include "llama.h"
+#include "get-model.h"
+
+#include <cstdlib>
+
+int main(int argc, char *argv[] ) {
+    auto * model_path = get_model_or_exit(argc, argv);
+    auto * file = fopen(model_path, "r");
+    if (file == nullptr) {
+        fprintf(stderr, "no model at '%s' found\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    fprintf(stderr, "using '%s'\n", model_path);
+    fclose(file);
+
+    llama_backend_init();
+    auto params = llama_model_params{};
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx){
+        (void) ctx;
+        return progress > 0.50;
+    };
+    auto * model = llama_model_load_from_file(model_path, params);
+    llama_backend_free();
+    return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/llama.cpp/tests/test-mtmd-c-api.c b/llama.cpp/tests/test-mtmd-c-api.c
new file mode 100644
index 0000000..02e762e
--- /dev/null
+++ b/llama.cpp/tests/test-mtmd-c-api.c
@@ -0,0 +1,63 @@
+#include <assert.h>
+#include <stdio.h>
+
+#include "mtmd.h"
+
+int main(void) {
+    printf("\n\nTesting libmtmd C API...\n");
+    printf("--------\n\n");
+
+    struct mtmd_context_params params = mtmd_context_params_default();
+    printf("Default image marker: %s\n", params.image_marker);
+
+    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();
+
+    if (!chunks) {
+        fprintf(stderr, "Failed to create input chunks\n");
+        return 1;
+    }
+
+    size_t n_chunks = mtmd_input_chunks_size(chunks);
+    printf("Number of chunks: %zu\n", n_chunks);
+    assert(n_chunks > 0);
+
+    for (size_t i = 0; i < n_chunks; i++) {
+        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
+        assert(chunk != NULL);
+        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
+        printf("Chunk %zu type: %d\n", i, type);
+
+        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            printf("    Text chunk with %zu tokens\n", n_tokens);
+            assert(tokens != NULL);
+            assert(n_tokens > 0);
+            for (size_t j = 0; j < n_tokens; j++) {
+                assert(tokens[j] >= 0);
+                printf("      > Token %zu: %d\n", j, tokens[j]);
+            }
+
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+            size_t nx = mtmd_image_tokens_get_nx(image_tokens);
+            size_t ny = mtmd_image_tokens_get_ny(image_tokens);
+            const char * id = mtmd_image_tokens_get_id(image_tokens);
+            assert(n_tokens > 0);
+            assert(nx > 0);
+            assert(ny > 0);
+            assert(id != NULL);
+            printf("    Image chunk with %zu tokens\n", n_tokens);
+            printf("    Image size: %zu x %zu\n", nx, ny);
+            printf("    Image ID: %s\n", id);
+        }
+    }
+
+    // Free the chunks
+    mtmd_input_chunks_free(chunks);
+
+    printf("\n\nDONE: test libmtmd C API...\n");
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-opt.cpp b/llama.cpp/tests/test-opt.cpp
new file mode 100644
index 0000000..8dcb4a7
--- /dev/null
+++ b/llama.cpp/tests/test-opt.cpp
@@ -0,0 +1,1003 @@
+// TODO refactor
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-opt.h"
+
+#include <cmath>
+#include <cinttypes>
+#include <random>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#define TEST_LOG(...) printf(__VA_ARGS__)
+
+static bool almost_equal(const double a, const double b, const double atol) {
+    return fabs(a - b) < atol;
+}
+
+constexpr int64_t ne_datapoint = 2;
+constexpr int64_t ne_label = 1;
+constexpr int64_t ndata = 6;
+
+struct helper_ctx_data {
+    std::vector<ggml_opt_dataset_t>   datasets_supervised;
+    std::vector<struct ggml_tensor *> data_batch;
+    std::vector<struct ggml_tensor *> labels_batch;
+
+    ggml_opt_dataset_t dataset_unsupervised;
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    struct ggml_opt_params opt_params;
+    ggml_opt_context_t opt_ctx;
+    struct ggml_tensor * inputs;
+    struct ggml_tensor * weights;
+    struct ggml_tensor * outputs;
+    ggml_backend_buffer_t buf;
+    ggml_opt_result_t result;
+    ggml_opt_result_t result2;
+};
+
+// These default values make it easier to check optimization results vs. expected values. 
+static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
+
+    result.adamw.alpha = 1.0f;
+    result.adamw.beta1 = 0.0f;
+    result.adamw.beta2 = 0.0f;
+    result.adamw.eps = 0.0f;
+    result.adamw.wd = 0.0f;
+    result.sgd.wd = 0.0f;
+    result.sgd.alpha = 1.0f;
+
+    return result;
+}
+
+static helper_ctx_data helper_get_ctx_data(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched,
+        ggml_backend_t backend,
+        const bool init_opt_ctx = true,
+        const bool optimizer_defaults = true,
+        int64_t nbatch_logical = 1,
+        int64_t nbatch_physical = 1,
+        enum ggml_opt_loss_type loss_type = GGML_OPT_LOSS_TYPE_SUM) {
+    std::vector<ggml_opt_dataset_t> datasets(ndata);
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = ggml_opt_dataset_init(
+            GGML_TYPE_F32, GGML_TYPE_F32, ne_datapoint, ne_label, ndata, ndata_shard);
+
+        float * data = ggml_get_data_f32(ggml_opt_dataset_data( dataset));
+        float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset));
+
+        for (int64_t idata = 0; idata < ndata; ++idata) {
+            for (int64_t id = 0; id < ne_datapoint; ++id) {
+                data[ idata*ne_datapoint + id] = 16*idata + id;
+            }
+            for (int64_t il = 0; il < ne_label; ++il) {
+                labels[idata*ne_label + il] = 16*(16*idata + il);
+            }
+        }
+
+        datasets[ndata_shard-1] = dataset;
+    }
+
+    ggml_opt_dataset_t dataset_unsupervised = ggml_opt_dataset_init(
+        GGML_TYPE_F32, GGML_TYPE_F32, 1, 0, ndata, /*ndata_shard =*/ 1);
+
+    float * data = ggml_get_data_f32(ggml_opt_dataset_data(dataset_unsupervised));
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        data[idata] = idata;
+    }
+
+    struct ggml_context * ctx_static;
+    struct ggml_context * ctx_compute;
+    {
+        struct ggml_init_params params = {
+            /*.mem_size =*/ (2*ndata + 2)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc =*/ true,
+        };
+        ctx_static = ggml_init(params);
+    }
+    {
+        struct ggml_init_params params = {
+            /*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc =*/ true,
+        };
+        ctx_compute = ggml_init(params);
+    }
+
+    std::vector<struct ggml_tensor *> data_batch(ndata);
+    std::vector<struct ggml_tensor *> labels_batch(ndata);
+    for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+        data_batch[ndata_batch-1] = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_datapoint);
+        labels_batch[ndata_batch-1] = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, ndata_batch*ne_label);
+    }
+
+    struct ggml_tensor * inputs = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, nbatch_physical);
+    ggml_set_name(inputs, "inputs");
+
+    struct ggml_tensor * weights = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1);
+    ggml_set_name(weights, "weights");
+    ggml_set_param(weights);
+
+    struct ggml_tensor * intermediary = ggml_add(ctx_compute, inputs, weights);
+
+    struct ggml_tensor * outputs = ggml_scale(ctx_compute, intermediary, 1.0f);
+    ggml_set_name(outputs, "outputs");
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend);
+    const float w0 = float(ndata)/2;
+    ggml_backend_tensor_set(weights, &w0, 0, sizeof(float));
+
+    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);
+    const int32_t opt_period = nbatch_logical / nbatch_physical;
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(backend_sched, loss_type);
+    opt_params.ctx_compute = ctx_compute;
+    opt_params.inputs = inputs;
+    opt_params.outputs = outputs;
+    opt_params.opt_period = 
opt_period;
+    opt_params.optimizer = optim;
+    if (!optimizer_defaults) {
+        opt_params.get_opt_pars = helper_get_test_opt_pars;
+    }
+    GGML_ASSERT(opt_params.get_opt_pars);
+    ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr;
+    GGML_ASSERT(!opt_ctx || ggml_opt_context_optimizer_type(opt_ctx) == opt_params.optimizer);
+
+    ggml_opt_result_t result = ggml_opt_result_init();
+    ggml_opt_result_t result2 = ggml_opt_result_init();
+
+    return {datasets, data_batch, labels_batch, dataset_unsupervised, ctx_static, ctx_compute, opt_params, opt_ctx, inputs, weights, outputs, buf, result, result2};
+}
+
+static void helper_free_ctx_data(struct helper_ctx_data ctx_data) {
+    ggml_opt_result_free(ctx_data.result);
+    ggml_opt_result_free(ctx_data.result2);
+    ggml_opt_free(ctx_data.opt_ctx);
+    ggml_backend_buffer_free(ctx_data.buf);
+    ggml_free(ctx_data.ctx_static);
+    ggml_free(ctx_data.ctx_compute);
+    for (ggml_opt_dataset_t dataset : ctx_data.datasets_supervised) {
+        ggml_opt_dataset_free(dataset);
+    }
+    ggml_opt_dataset_free(ctx_data.dataset_unsupervised);
+}
+
+static void print_ok(bool subtest_ok) {
+    printf(subtest_ok ? "\033[1;32mOK\033[0m\n" : "\033[1;31mFAIL\033[0m\n");
+}
+
+static void helper_after_test(
+        enum ggml_opt_optimizer_type optim,
+        const char * func, const bool high_level, const std::string options,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    printf("  %s(high_level=%s%s, subtest=%s, optimizer=%s): ",
+           func, high_level ? "yes" : "no", options.c_str(), subtest.c_str(), ggml_opt_optimizer_name(optim));
+    print_ok(subtest_ok);
+    if (subtest_ok)
+        npass++;
+    ntest++;
+}
+
+static void print_ok(const char * func, bool subtest_ok, int & npass, int & ntest, const char * args = "") {
+    printf("  %s(%s): ", func, args);
+    print_ok(subtest_ok);
+    if (subtest_ok)
+        npass++;
+    ++ntest;
+}
+
+static std::pair<int, int> test_dataset(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend);
+
+    for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
+        ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1];
+
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+
+        for (int64_t ndata_batch = 1; ndata_batch <= ndata; ++ndata_batch) {
+            if (ndata_batch % ndata_shard != 0) {
+                continue;
+            }
+            bool subtest_ok = true;
+
+            struct ggml_tensor * data_batch = cd.data_batch[ndata_batch-1];
+            struct ggml_tensor * labels_batch = cd.labels_batch[ndata_batch-1];
+
+            std::vector<float> data(ggml_nelements( data_batch));
+            std::vector<float> labels(ggml_nelements(labels_batch));
+
+            std::vector<int64_t> idata_shuffled;
+            const int64_t nbatches = ndata / ndata_batch;
+            for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
+                ggml_opt_dataset_get_batch(dataset, data_batch, labels_batch, ibatch);
+
+                ggml_backend_tensor_get( data_batch, data.data(), 0, ggml_nbytes( data_batch));
+                ggml_backend_tensor_get(labels_batch, labels.data(), 0, ggml_nbytes(labels_batch));
+
+                for (int64_t idata_batch = 0; idata_batch < ndata_batch; ++idata_batch) {
+                    const int64_t idata = ibatch*ndata_batch + idata_batch;
+                    const int64_t idata_found = data[idata_batch*ne_datapoint] / 16;
+                    subtest_ok = subtest_ok && (shuffle || idata_found == idata);
+                    idata_shuffled.push_back(idata_found);
+
+                    for (int64_t id = 0; id < ne_datapoint; ++id) {
+                        if (data[ idata_batch*ne_datapoint + id] != 
16*idata_found + id) {
+                            subtest_ok = false;
+                        }
+                    }
+                    for (int64_t il = 0; il < ne_label; ++il) {
+                        if (labels[idata_batch*ne_label + il] != 16*(16*idata_found + il)) {
+                            subtest_ok = false;
+                        }
+                    }
+                }
+            }
+
+            if (!shuffle || ndata % ndata_batch == 0) {
+                const int ndata_max = (ndata / ndata_batch) * ndata_batch;
+
+                for (int64_t idata = 0; subtest_ok && idata < ndata_max; ++idata) {
+                    int ninstances = 0;
+                    for (int64_t id : idata_shuffled) {
+                        ninstances += id == idata;
+                    }
+                    if (ninstances != 1) {
+                        subtest_ok = false;
+                    }
+                }
+            }
+
+            printf("  %s(shuffle=%s, ndata_shard=%" PRId64 ", ndata_batch=%" PRId64 "): ",
+                   __func__, shuffle ? "yes" : "no", ndata_shard, ndata_batch);
+            if (subtest_ok) {
+                printf("\033[1;32mOK\033[0m\n");
+                npass++;
+            } else {
+                printf("\033[1;31mFAIL\033[0m\n");
+            }
+            ntest++;
+        }
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_grad(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
+        /*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    for (int idata = 0; idata < ndata; ++idata) {
+        const float idataf = idata;
+        ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
+        // leaked
+        ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+        ggml_opt_eval(cd.opt_ctx, cd.result);
+        ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, sizeof(float));
+    }
+
+    {
+        bool subtest_ok = true;
+        for (int idata = 0; idata < ndata; ++idata) {
+            if (grad_history[idata] != idata + 1) {
+                subtest_ok = false;
+            }
+        }
+        printf("  %s(): ", __func__);
+        if (subtest_ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            npass++;
+        } else {
+            printf("\033[1;31mFAIL\033[0m\n");
+        }
+        ntest++;
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_forward_backward(
+        enum ggml_opt_optimizer_type optim,
+        const char * func, const bool high_level, const bool shuffle,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", shuffle=";
+    options += shuffle ? 
"yes" : "no"; + helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass); +} + +static std::pair test_forward_backward( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) { + int ntest = 0; + int npass = 0; + + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false); + struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx); + + std::vector loss_history(ndata); + for (int64_t idata = 0; idata < ndata; ++idata) { + loss_history[idata] = NAN; + } + + { + int64_t ndata; + ggml_opt_result_ndata(cd.result, &ndata); + double loss; + double loss_unc; + ggml_opt_result_loss(cd.result, &loss, &loss_unc); + double accuracy; + double accuracy_unc; + ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); + const bool subtest_ok = ndata == 0 && almost_equal(loss, 0.0, 1e-6) && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass); + } + + if (high_level) { + ggml_opt_dataset_t dataset = cd.dataset_unsupervised; + if (shuffle) { + ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1); + } + ggml_opt_epoch(cd.opt_ctx, dataset, nullptr, cd.result, 0, nullptr, nullptr); + } else { + for (int idata = 0; idata < ndata; ++idata) { + const float idataf = idata; + ggml_opt_alloc(cd.opt_ctx, /*backward =*/ false); + ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs)); + ggml_opt_eval(cd.opt_ctx, cd.result); + ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float)); + } + } + + { + float weights; + ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); + const bool subtest_ok = almost_equal(weights, ndata/2, 1e-10); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass); + } + { + constexpr double atol = 1e-10; + + int64_t ndata; + ggml_opt_result_ndata(cd.result, &ndata); + bool subtest_ok = ndata == 6; + + double loss; + double loss_unc; + ggml_opt_result_loss(cd.result, &loss, &loss_unc); + subtest_ok = subtest_ok && almost_equal(loss, 33.0, atol) && almost_equal(loss_unc, sqrt(3.5), atol); + + double accuracy; + double accuracy_unc; + ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); + subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); + + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass); + } + + float w0; + ggml_backend_tensor_get(cd.weights, &w0, 0, sizeof(float)); + for (int i = 0; i < 10; ++i) { + ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true); + // leaked. 
+        ggml_opt_eval(cd.opt_ctx, cd.result);
+    }
+    ggml_backend_tensor_set(cd.weights, &w0, 0, sizeof(float));
+
+    ggml_opt_reset(cd.opt_ctx, /*optimizer =*/ false);
+    ggml_opt_result_reset(cd.result);
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    if (high_level) {
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+        if (shuffle) {
+            ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        }
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+    } else {
+        for (int idata = 0; idata < ndata; ++idata) {
+            const float idataf = idata;
+            ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
+            ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+            ggml_opt_eval(cd.opt_ctx, cd.result);
+            ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+        }
+    }
+
+    {
+        float weights;
+        ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+        const bool subtest_ok = almost_equal(weights, -ndata * 0.5, 1e-10);
+        helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
+    }
+    {
+        int64_t ndata;
+        ggml_opt_result_ndata(cd.result, &ndata);
+        bool subtest_ok = ndata == 6;
+
+        double loss;
+        double loss_unc;
+        ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+        subtest_ok = subtest_ok && almost_equal(loss, 18.0, 1e-10) && (shuffle || loss_unc == 0.0);
+
+        double accuracy;
+        double accuracy_unc;
+        ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+        subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+        helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static std::pair<int, int> test_epoch_vs_fit(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    float weights_epoch;
+    float weights_fit;
+
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
+        ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
+        // leaked. 
+
+        ggml_backend_tensor_get(cd.weights, &weights_epoch, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+    {
+        struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ false);
+        ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
+
+        ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset, GGML_OPT_LOSS_TYPE_SUM,
+                     optim, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
+
+        ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights));
+        helper_free_ctx_data(cd);
+    }
+
+    const bool subtest_ok = weights_epoch == weights_fit;
+
+    print_ok(__func__, subtest_ok, npass, ntest);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_idata_split(
+        enum ggml_opt_optimizer_type optim,
+        const char * func, const bool high_level, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_idata_split(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
+    struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
+    const int idata_split = ndata * 2/3;
+
+    std::vector<float> loss_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        loss_history[idata] = NAN;
+    }
+
+    bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (high_level) {
+            ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr);
+        } else {
+            int idata = 0;
+            for (; idata < idata_split; ++idata) {
+                const float idataf = idata;
+                ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_eval(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+            for (; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_opt_alloc(cd.opt_ctx, /*backward =*/ false);
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
+                ggml_opt_eval(cd.opt_ctx, cd.result2);
+                ggml_backend_tensor_get(loss, loss_history.data() + idata, 0, sizeof(float));
+            }
+        }
+
+        if (adamw) {
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = almost_equal(weights, ndata/2 - epoch*idata_split, 1e-10);
+            helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        if (adamw) {
+            constexpr double atol = 1e-10;
+
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = ndata_result == idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result, &loss, &loss_unc);
+            subtest_ok = subtest_ok && almost_equal(loss, 28.0 - epoch*16.0, atol) && almost_equal(loss_unc, 0.0, atol);
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
+        }
+        
if (adamw) {
+            constexpr double atol = 1e-10;
+
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result2, &ndata_result);
+            bool subtest_ok = ndata_result == ndata - idata_split;
+
+            double loss;
+            double loss_unc;
+            ggml_opt_result_loss(cd.result2, &loss, &loss_unc);
+            subtest_ok = subtest_ok && almost_equal(loss, 15.0 - epoch*8, atol) && almost_equal(loss_unc, sqrt(0.5), atol);
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+        ggml_opt_result_reset(cd.result2);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+static void helper_after_test_gradient_accumulation(
+        enum ggml_opt_optimizer_type optim,
+        const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch,
+        const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
+    std::string options = ", nbatch_physical=";
+    options += std::to_string(nbatch_physical);
+    options += ", loss_type=";
+    options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum";
+    options += ", epoch=";
+    options += std::to_string(epoch);
+    helper_after_test(optim, func, false, options, subtest, subtest_ok, ntest, npass);
+}
+
+static std::pair<int, int> test_gradient_accumulation(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) {
+    int ntest = 0;
+    int npass = 0;
+
+    struct helper_ctx_data cd = helper_get_ctx_data(
+        optim,
+        backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type);
+
+    std::vector<float> grad_history(ndata);
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        grad_history[idata] = NAN;
+    }
+
+    bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    if (adamw)
+    for (int epoch = 1; epoch <= 4; ++epoch) {
+        if (nbatch_physical == 1) {
+            for (int idata = 0; idata < ndata; ++idata) {
+                const float idataf = idata;
+                ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
+                ggml_backend_tensor_set(cd.inputs, &idataf, 0, 1*sizeof(float));
+                ggml_opt_eval(cd.opt_ctx, cd.result);
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, 1*sizeof(float));
+            }
+        } else if (nbatch_physical == 2) {
+            for (int idata = 0; idata < ndata; idata += 2) {
+                const float idataf[2] = {float(idata + 0), float(idata + 1)};
+                ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
+                ggml_backend_tensor_set(cd.inputs, idataf, 0, 2*sizeof(float));
+                ggml_opt_eval(cd.opt_ctx, cd.result);
+
+                grad_history[idata + 0] = 0.0f;
+                ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata + 1, 0, 1*sizeof(float));
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+
+        {
+            GGML_ASSERT(ndata == 6);
+            constexpr double atol = 1e-6;
+            bool subtest_ok = true;
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0, atol);
+                    subtest_ok = subtest_ok && 
almost_equal(grad_history[2], 0.0, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 6.0, atol);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                if (nbatch_physical == 1) {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 1.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 3.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 5.0/ndata, atol);
+                } else {
+                    subtest_ok = subtest_ok && almost_equal(grad_history[0], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[2], 0.0/ndata, atol);
+                    subtest_ok = subtest_ok && almost_equal(grad_history[4], 0.0/ndata, atol);
+                }
+                subtest_ok = subtest_ok && almost_equal(grad_history[1], 2.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[3], 4.0/ndata, atol);
+                subtest_ok = subtest_ok && almost_equal(grad_history[5], 6.0/ndata, atol);
+            } else {
+                GGML_ASSERT(false);
+            }
+            helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
+        }
+        bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+        if (adamw) {
+            constexpr double atol = 1e-6;
+            float weights;
+            ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
+            const bool subtest_ok = almost_equal(weights, (ndata/2) - epoch, atol);
+            helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
+        }
+        {
+            constexpr double atol = 1e-6;
+            int64_t ndata_result;
+            ggml_opt_result_ndata(cd.result, &ndata_result);
+            bool subtest_ok = almost_equal(ndata_result, ndata/nbatch_physical, atol);
+
+            double loss;
+            ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr);
+            if (loss_type == GGML_OPT_LOSS_TYPE_SUM) {
+                subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0), atol);
+            } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) {
+                subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, atol);
+            } else {
+                GGML_ASSERT(false);
+            }
+
+            double accuracy;
+            double accuracy_unc;
+            ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
+            subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
+
+            helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
+        }
+
+        ggml_opt_result_reset(cd.result);
+    }
+
+    helper_free_ctx_data(cd);
+
+    return std::make_pair(npass, ntest);
+}
+
+float constexpr g_sgd_lr = 1e-4f;
+
+int constexpr g_sgd_epochs = 900;
+
+static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) {
+    int64_t epoch = *(int64_t*)userdata;
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+    result.adamw.alpha = 0.1f;
+    result.sgd.alpha = g_sgd_lr * std::pow(.99, 1000 * (double)epoch / g_sgd_epochs);
+    result.sgd.wd = 1e-10;
+    return result;
+}
+
+static std::pair<int, int> test_regression(
+        enum ggml_opt_optimizer_type optim,
+        ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
+    int ntest = 0;
+    int npass = 0;
+
+    // Test for simple regression with f(x) = a*x + b
+
+    constexpr int64_t ndata_regression = 201;
+    constexpr float a_true = 1.2f;
+    constexpr float b_true = 3.4f;
+
+    std::mt19937 gen(12345);
+    std::normal_distribution<float> nd{0.0f, 0.1f};
+
+ ggml_opt_dataset_t dataset = ggml_opt_dataset_init( + GGML_TYPE_F32, GGML_TYPE_F32, 1, 1, ndata_regression, ndata_regression); + + float * data = ggml_get_data_f32(ggml_opt_dataset_data( dataset)); + float * labels = ggml_get_data_f32(ggml_opt_dataset_labels(dataset)); + + constexpr float x_min = -100.0f; + constexpr float x_max = 100.0f; + + for (int64_t idata = 0; idata < ndata_regression; ++idata) { + const float x = x_min + (x_max - x_min) * idata/(ndata_regression-1); + const float y = a_true*x + b_true + nd(gen); + + data[idata] = x; + labels[idata] = y; + } + + struct ggml_context * ctx_static; + struct ggml_context * ctx_compute; + { + struct ggml_init_params params = { + /*.mem_size =*/ 3*ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_static = ggml_init(params); + } + { + struct ggml_init_params params = { + /*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_compute = ggml_init(params); + } + + // The first dimension is the dimension of the datapoints, the second dimension is the number of datapoints. + struct ggml_tensor * x = ggml_new_tensor_2d(ctx_static, GGML_TYPE_F32, 1, ndata_regression); + ggml_set_name(x, "x"); + + struct ggml_tensor * a = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1); + ggml_set_name(a, "a"); + ggml_set_param(a); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx_static, GGML_TYPE_F32, 1); + ggml_set_name(b, "b"); + ggml_set_param(b); + + struct ggml_tensor * f = ggml_add(ctx_compute, ggml_mul(ctx_compute, x, a), b); + ggml_set_name(f, "f"); + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_static, backend); + const float a0 = 1.0f; + const float b0 = 3.0f; + ggml_backend_tensor_set(a, &a0, 0, sizeof(float)); + ggml_backend_tensor_set(b, &b0, 0, sizeof(float)); + + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + int64_t const n_epoch = adamw ? 100 : g_sgd_epochs; + ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, optim, + helper_get_regression_opt_pars, n_epoch, ndata_regression, 0.0f, true); + + { + float a_fit; + ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float)); + float b_fit; + ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float)); + float tol = adamw ? 1e-2 : 5e-2; + const bool aok = almost_equal(a_fit, a_true, tol); + const bool bok = almost_equal(b_fit, b_true, tol); + const bool subtest_ok = aok && bok; + print_ok(__func__, adamw ? 
subtest_ok : true, npass, ntest, "subtest=weights"); + } + + ggml_backend_buffer_free(buf); + ggml_free(ctx_static); + ggml_opt_dataset_free(dataset); + + return std::make_pair(npass, ntest); +} + +static std::pair test_backend( + ggml_backend_sched_t backend_sched, ggml_backend_t backend, enum ggml_opt_optimizer_type optim) { + int npass = 0; + int ntest = 0; + + for (bool shuffle : {false, true}) { + std::pair partial = test_dataset(optim, backend_sched, backend, shuffle); + npass += partial.first; + ntest += partial.second; + } + { + std::pair partial = test_grad(optim, backend_sched, backend); + npass += partial.first; + ntest += partial.second; + } + for (bool high_level : {false, true}){ + for (bool shuffle : {false, true}) { + if (!high_level && shuffle) { + continue; + } + + std::pair partial = test_forward_backward(optim, backend_sched, backend, high_level, shuffle); + npass += partial.first; + ntest += partial.second; + } + } + { + std::pair partial = test_epoch_vs_fit(optim, backend_sched, backend); + npass += partial.first; + ntest += partial.second; + } + for (bool high_level : {false, true}){ + std::pair partial = test_idata_split(optim, backend_sched, backend, high_level); + npass += partial.first; + ntest += partial.second; + } + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + if (adamw) { + for (int32_t nbatch_physical : { 2, 1 }) { + for (enum ggml_opt_loss_type loss_type : { GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN }) { + std::pair partial = + test_gradient_accumulation(optim, backend_sched, backend, nbatch_physical, loss_type); + npass += partial.first; + ntest += partial.second; + } + } + } + { + std::pair partial = test_regression(optim, backend_sched, backend); + npass += partial.first; + ntest += partial.second; + } + + return std::make_pair(npass, ntest); +} + + +int main(void) { + ggml_log_set(nullptr, nullptr); + ggml_backend_load_all(); + const size_t dev_count = ggml_backend_dev_count(); + printf("Testing %zu devices\n\n", dev_count); + size_t n_ok = 0; + + std::vector devs; + std::vector backends; + + for (size_t i = 0; i < dev_count; ++i) { + devs.push_back(ggml_backend_dev_get(i)); + + ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL); + GGML_ASSERT(backend != NULL); + + auto * reg = ggml_backend_dev_backend_reg(devs[i]); + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency() / 2); + } + backends.push_back(backend); + } + + size_t n_total = 0; + for (enum ggml_opt_optimizer_type optim : { GGML_OPT_OPTIMIZER_TYPE_ADAMW, GGML_OPT_OPTIMIZER_TYPE_SGD }) { + for (size_t i = 0; i < dev_count; ++i) { + // Put the backend to be tested in front so that it's prioritized: + std::vector backends_modded = { backends[i] }; + backends_modded.insert(backends_modded.end(), backends.begin(), backends.end()); + + ggml_backend_sched_t backend_sched = ggml_backend_sched_new( + backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true); + + char const* devname = ggml_backend_dev_name(devs[i]); + printf("Backend %zu/%zu: %s\n", i + 1, dev_count, devname); + printf(" Device description: %s\n", ggml_backend_dev_description(devs[i])); + size_t free, total; // NOLINT + ggml_backend_dev_memory(devs[i], &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + 
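+            // [editorial note] backends_modded lists the backend under test first,
+            // so ggml_backend_sched assigns each op to it whenever it reports
+            // support, with the remaining backends (e.g. CPU) acting as fallbacks.
+            // The probe graph built just below then checks whether this backend
+            // implements the optimizer step op at all, and skips it cleanly if not.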
printf("\n"); + + bool skip; + { + struct ggml_init_params params = { + /*.mem_size =*/ 6*ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_param(a); + ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_tensor * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_tensor * t = nullptr; + switch (optim) { + case GGML_OPT_OPTIMIZER_TYPE_ADAMW: { + ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7); + t = ggml_opt_step_adamw(ctx, a, b, c, d, p); + } break; + case GGML_OPT_OPTIMIZER_TYPE_SGD: { + ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); + t = ggml_opt_step_sgd(ctx, a, b, p); + } break; + case GGML_OPT_OPTIMIZER_TYPE_COUNT: { + GGML_ABORT("fatal error"); + } + } + skip = !ggml_backend_supports_op(backends[i], t); + ggml_free(ctx); + } + + std::pair result; + if (!skip) { + result = test_backend(backend_sched, backends[i], optim); + printf(" %d/%d tests passed\n", result.first, result.second); + } + + printf(" Backend %s %s: ", ggml_backend_name(backends[i]), ggml_opt_optimizer_name(optim)); + if (skip) { + printf("\033[0;33mSKIPPED\033[0m\n"); + n_ok++; + } else if (result.first == result.second) { + printf("\033[1;32mOK\033[0m\n"); + n_ok++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ++n_total; + printf("\n"); + ggml_backend_sched_free(backend_sched); + } + } + + for (ggml_backend_t backend : backends) { + ggml_backend_free(backend); + } + + printf("%zu/%zu backend*optimizer passed\n", n_ok, n_total); + bool ok = n_ok == n_total; + print_ok(ok); + return ok ? 0 : 1; +} diff --git a/llama.cpp/tests/test-peg-parser.cpp b/llama.cpp/tests/test-peg-parser.cpp new file mode 100644 index 0000000..220745d --- /dev/null +++ b/llama.cpp/tests/test-peg-parser.cpp @@ -0,0 +1,25 @@ +#include +#include +#include + +#include "peg-parser/tests.h" + +int main(int argc, char *argv[]) { + testing t(std::cout); + if (argc >= 2) { + t.set_filter(argv[1]); + } + + const char * verbose = getenv("LLAMA_TEST_VERBOSE"); + if (verbose) { + t.verbose = std::string(verbose) == "1"; + } + + t.test("basic", test_basic); + t.test("unicode", test_unicode); + t.test("json", test_json_parser); + t.test("gbnf", test_gbnf_generation); + t.test("serialization", test_json_serialization); + + return t.summary(); +} diff --git a/llama.cpp/tests/test-quantize-fns.cpp b/llama.cpp/tests/test-quantize-fns.cpp new file mode 100644 index 0000000..037c058 --- /dev/null +++ b/llama.cpp/tests/test-quantize-fns.cpp @@ -0,0 +1,186 @@ +// Unit tests for quantization specific functions - quantize, dequantize and dot product + +#include "ggml.h" +#include "ggml-cpu.h" + +#undef NDEBUG +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; +constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; +constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; +constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f; + +static const char* 
RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Generate synthetic data
+static void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset);
+    }
+}
+
+// Calculate RMSE between two float arrays
+static float array_rmse(const float * a1, const float * a2, size_t n) {
+    double sum = 0;
+    for (size_t i = 0; i < n; i++) {
+        double diff = a1[i] - a2[i];
+        sum += diff * diff;
+    }
+    return sqrtf(sum) / n;
+}
+
+// Total quantization error on test data
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
+    return array_rmse(test_data, tmp_out.data(), test_size);
+}
+
+// Quantization error relative to the reference implementation
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+    std::vector<float> tmp_out_ref(test_size);
+
+    // FIXME: why is this done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
+
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+
+    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
+}
+
+static float dot_product(const float * a1, const float * a2, size_t test_size) {
+    double sum = 0;
+    for (size_t i = 0; i < test_size; i++) {
+        sum += a1[i] * a2[i];
+    }
+    return sum;
+}
+
+// Total dot product error
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
+    std::vector<uint8_t> tmp_q1(2*test_size);
+    std::vector<uint8_t> tmp_q2(2*test_size);
+
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
+
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
+
+    float result = INFINITY;
+    qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+
+    const float dot_ref = dot_product(test_data1, test_data2, test_size);
+
+    return fabsf(result - dot_ref) / test_size;
+}
+
+int main(int argc, char * argv[]) {
+    bool verbose = false;
+    const size_t test_size = 32 * 128;
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data());
+
+    ggml_cpu_init();
+
+    int num_failed = 0;
+    bool failed = false;
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+
+        // deprecated - skip
+        if (qfns->blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name(type));
+        ggml_quantize_init(type);
+
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error =
total_quantization_error(qfns, qfns_cpu, test_size, test_data.data()); + const float max_quantization_error = + type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; + failed = !(total_error < max_quantization_error); + num_failed += failed; + if (failed || verbose) { + printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error); + } + + const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data()); + failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR); + num_failed += failed; + if (failed || verbose) { + printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error); + } + + const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data()); + const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S + ? MAX_DOT_PRODUCT_ERROR_LOWBIT + : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 + ? MAX_DOT_PRODUCT_ERROR_TERNARY + : MAX_DOT_PRODUCT_ERROR; + failed = !(vec_dot_error < max_allowed_error); + num_failed += failed; + if (failed || verbose) { + printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error); + } + } + } + + if (num_failed || verbose) { + printf("%d tests failed\n", num_failed); + } + + return num_failed > 0; +} diff --git a/llama.cpp/tests/test-quantize-perf.cpp b/llama.cpp/tests/test-quantize-perf.cpp new file mode 100644 index 0000000..cac0782 --- /dev/null +++ b/llama.cpp/tests/test-quantize-perf.cpp @@ -0,0 +1,356 @@ +// Benchmark quantization specific functions on synthetic data + +#include "ggml.h" +#include "ggml-cpu.h" + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define MAX_ALIGNMENT 64 +#define QK 32 +#define WARMUP 5 +#define ITERATIONS 10 +#define MAX_ITERATIONS 100000000 + +#define L1_SIZE 32*128 +#define L2_SIZE 32*2048 +#define L3_SIZE 32*20480 +#define MEM_SIZE 32*2048000 + +struct quantize_perf_params { + std::vector include_types; + std::vector test_sizes; + size_t alignment_offset = 0; + bool op_quantize_row_q_reference = false; + bool op_quantize_row_q = false; + bool op_dequantize_row_q = false; + bool op_quantize_row_q_dot = false; + bool op_vec_dot_q = false; + int64_t iterations = ITERATIONS; +}; + +#if defined(__x86_64__) || defined(__i386__) + +#include +inline int64_t cpu_cycles() { +// Rough way to detect new-ish CPUs +#ifdef __POPCNT__ + unsigned int dummy; + return __rdtscp(&dummy); +#else + return __rdtsc(); +#endif +} + +#else + +#define cpu_cycles() 0 + +#endif + + +// Generate synthetic data +static void generate_data(float offset, size_t n, float * dst) { + for (size_t i = 0; i < n; i++) { + dst[i] = 0.1 + 2*cosf(i + offset); + } +} + +static float gigabytes_per_second(size_t bytes, int64_t 
usecs) { + return bytes / (float) usecs * 1000000 / (1024*1024*1024); +} + +static void * align_with_offset(void * ptr, int offset) { + size_t dummy_size = MAX_ALIGNMENT * 4; + return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; +} + +static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & func) { + int64_t min_time_us = INT64_MAX; + int64_t total_time_us = 0; + int64_t min_time_cycles = INT64_MAX; + int64_t total_time_cycles = 0; + + for (int i = 0; i < WARMUP; i++) { + func(); + } + + for (int i = 0; i < iterations; i++) { + const int64_t start_time = ggml_time_us(); + const int64_t start_cycles = cpu_cycles(); + + func(); + + const int64_t end_cycles = cpu_cycles(); + const int64_t end_time = ggml_time_us(); + + total_time_cycles += end_cycles - start_cycles; + min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles); + total_time_us += end_time - start_time; + min_time_us = std::min(min_time_us, end_time - start_time); + } + + printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size); + printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations)); + printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us)); + printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); +} + +static void usage(char * argv[]) { + printf("Benchmark quantization specific functions on synthetic data\n"); + printf("\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options: (default)\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); + printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); + printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); + printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); + printf(" quantize_row_q_dot, vec_dot_q (all)\n"); + printf(" --type TYPE set test type as"); + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type type = (ggml_type) i; + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + if (ggml_type_name(type) != NULL) { + if (qfns_cpu->from_float && qfns->to_float) { + printf(" %s", ggml_type_name(type)); + } + } + } + printf(" (all)\n"); + printf(" --alignment-offset OFFSET\n"); + printf(" set alignment offset as OFFSET (0)\n"); + printf(" -i NUM, --iterations NUM\n"); + printf(" set test iteration number (%d)\n", ITERATIONS); +} + +int main(int argc, char * argv[]) { + quantize_perf_params params {}; + + // read command line + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "--size") { + if (++i >= argc) { + invalid_param = true; + break; + } + size_t size = std::stoi(argv[i]); + if (size % 32 != 0) { + fprintf(stderr, "error: size %zu not divisible by 32\n", size); + invalid_param = true; + break; + } + params.test_sizes.push_back(size); + } else if (arg == "-3") { + // quick select sizes that probably fit in CPU caches + params.test_sizes.push_back(L1_SIZE); + params.test_sizes.push_back(L2_SIZE); + params.test_sizes.push_back(L3_SIZE); + } else if (arg == "-4") { + // quick select cache sizes + memory + params.test_sizes.push_back(L1_SIZE); 
+ params.test_sizes.push_back(L2_SIZE); + params.test_sizes.push_back(L3_SIZE); + params.test_sizes.push_back(MEM_SIZE); + } else if (arg == "--op") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string op {argv[i]}; + if (op == "quantize_row_q_reference") { + params.op_quantize_row_q_reference = true; + } else if (op == "quantize_row_q") { + params.op_quantize_row_q = true; + } else if (op == "dequantize_row_q") { + params.op_dequantize_row_q = true; + } else if (op == "quantize_row_q_dot") { + params.op_quantize_row_q_dot = true; + } else if (op == "vec_dot_q") { + params.op_vec_dot_q = true; + } else { + invalid_param = true; + break; + } + } else if (arg == "--type") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_types.push_back(argv[i]); + } else if (arg == "--alignment-offset") { + if (++i >= argc) { + invalid_param = true; + break; + } + int alignment = std::stoi(argv[i]); + if (alignment < 0 || alignment > MAX_ALIGNMENT) { + fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT); + invalid_param = true; + break; + } + params.alignment_offset = alignment; + } else if ((arg == "-i") || (arg == "--iterations")) { + if (++i >= argc) { + invalid_param = true; + break; + } + int number = std::stoi(argv[i]); + if (number < 0 || number > MAX_ITERATIONS) { + fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS); + invalid_param = true; + break; + } + params.iterations = number; + } else if ((arg == "-h") || (arg == "--help")) { + usage(argv); + return 1; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return 1; + } + + if (params.test_sizes.empty()) { + params.test_sizes.push_back(L1_SIZE); + } + if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) { + params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true; + } + + std::sort(params.test_sizes.begin(), params.test_sizes.end()); + size_t largest = params.test_sizes.back(); + + std::vector test_data1_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_data2_v(largest*4 + MAX_ALIGNMENT*2); + std::vector test_q1_v (largest*4 + MAX_ALIGNMENT*2); + std::vector test_q2_v (largest*4 + MAX_ALIGNMENT*2); + std::vector test_out_v (largest*4 + MAX_ALIGNMENT*2); + + float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset); + float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset); + float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset); + float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset); + float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset); + + generate_data(0, largest, test_data1); + generate_data(1, largest, test_data2); + + int64_t iterations = params.iterations; + + ggml_cpu_init(); + + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type type = (ggml_type) i; + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == 
params.include_types.end()) { + continue; + } + + if (qfns_cpu->from_float && qfns->to_float) { + printf("%s\n", ggml_type_name(type)); + + ggml_quantize_init(type); + + if (params.op_quantize_row_q_reference) { + printf(" quantize_row_q_reference\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + qfns->from_float_ref(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = ggml_row_size(type, size); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q) { + printf(" quantize_row_q\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + qfns_cpu->from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = ggml_row_size(type, size); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_dequantize_row_q) { + printf(" dequantize_row_q\n"); + qfns_cpu->from_float(test_data1, test_q1, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + qfns->to_float(test_q1, test_out, size); + return test_out[0]; + }; + size_t quantized_size = ggml_row_size(type, size); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_quantize_row_q_dot) { + printf(" quantize_row_q_dot\n"); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type); + vdot->from_float(test_data1, test_q1, size); + return test_q1[0]; + }; + size_t quantized_size = ggml_row_size(type, size); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + + if (params.op_vec_dot_q) { + printf(" vec_dot_q\n"); + qfns_cpu->from_float(test_data1, test_q1, largest); + qfns_cpu->from_float(test_data2, test_q2, largest); + for (size_t size : params.test_sizes) { + printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); + auto quantize_fn = [&](void) -> float { + float result; + qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); + return result; + }; + size_t quantized_size = ggml_row_size(type, size); + benchmark_function(size, quantized_size, iterations, quantize_fn); + } + printf("\n"); + } + } + } + + return 0; +} diff --git a/llama.cpp/tests/test-quantize-stats.cpp b/llama.cpp/tests/test-quantize-stats.cpp new file mode 100644 index 0000000..de587d4 --- /dev/null +++ b/llama.cpp/tests/test-quantize-stats.cpp @@ -0,0 +1,424 @@ +#include "ggml.h" +#include "ggml-cpu.h" +#include "llama.h" +#include "common.h" + +#include "../src/llama-model.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct quantize_stats_params { + std::string model = "models/7B/ggml-model-f16.gguf"; + bool verbose = false; + bool per_layer_stats = false; + bool print_histogram = false; + bool reference = false; + std::vector include_layers; + std::vector exclude_layers; + std::vector include_types; +}; + +constexpr size_t 
HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
+
+struct error_stats {
+    size_t num_samples;
+    double total_error;
+    double max_error;
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+    for (const auto & excluded : params.exclude_layers) {
+        if (std::regex_search(layer, std::regex(excluded))) {
+            return false;
+        }
+    }
+    for (const auto & included : params.include_layers) {
+        if (std::regex_search(layer, std::regex(included))) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+    }
+    stats.num_samples += nelements;
+}
+
+static void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
+static double find_quantile(const error_stats & stats, double quantile) {
+    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+
+    double cumsum = 0;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+        cumsum += stats.error_histogram[i];
+        if (cumsum >= sum*quantile) {
+            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS-1) upper = INFINITY;
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
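+// [editorial sketch, not part of the original file] How the helpers above compose:
+// update_error_stats() accumulates squared error plus a bucketed |error| histogram,
+// and print_error_stats() reports rmse together with histogram-derived quantiles:
+//
+//   error_stats st {};
+//   const float in [4] = {0.00f, 0.50f, 1.00f, 1.50f};
+//   const float out[4] = {0.01f, 0.50f, 0.98f, 1.50f};
+//   update_error_stats(4, in, out, st);
+//   print_error_stats("demo", st, /*print_histogram=*/false);
+//
+// find_quantile() walks the histogram until the cumulative count reaches
+// quantile * num_samples, so median/95pct are reported as bucket upper bounds
+// with resolution HISTOGRAM_RANGE/HISTOGRAM_BUCKETS = 0.0002.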
+// copied from ggml.h - verify that we can access this as a flat array +static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static void test_roundtrip_on_chunk( + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats +) { + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); + } else { + qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); + } + qfns.to_float(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, stats); +} + + +// Run quantization function for a single layer and update error stats +static void test_roundtrip_on_layer( + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, + std::vector & output_scratch, error_stats & total_error, int max_thread = 0 +) { + assert(tensor_is_contiguous(layer)); + error_stats layer_error {}; + uint64_t nelements = ggml_nelements(layer); + + float* input_scratch_ptr = nullptr; + if (layer->type == GGML_TYPE_F16) { + if (input_scratch.size() < nelements) input_scratch.resize(nelements); + input_scratch_ptr = input_scratch.data(); + } + if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements); + if (output_scratch.size() < nelements) output_scratch.resize(nelements); + + if (max_thread < 1) max_thread = std::thread::hardware_concurrency(); + int chunk_size = 32*512; + int num_chunks = (nelements + chunk_size - 1)/chunk_size; + + if (num_chunks < 2 || max_thread < 2) { + test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(), + output_scratch.data(), print_layer_stats ? layer_error : total_error); + } else { + auto & stats = print_layer_stats ? layer_error : total_error; + std::mutex mutex; + uint64_t counter = 0; + auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr, + &quantized_scratch, &output_scratch, chunk_size] () { + error_stats local_stats {}; + while (true) { + std::unique_lock lock(mutex); + uint64_t offset = counter; counter += chunk_size; + if (offset >= nelements) { + combine_error_stats(stats, local_stats); + break; + } + lock.unlock(); + uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; + test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset, + quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); + } + }; + int nthread = std::min(num_chunks, max_thread); + std::vector workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); + } + + if (print_layer_stats) { + print_error_stats(name, layer_error, false); + combine_error_stats(total_error, layer_error); + } +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + quantize_stats_params params; + + // read command line + + int max_thread = 0; + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-h" || arg == "--help") { + quantize_stats_print_usage(argc, argv); + exit(0); + } else if (arg == "-r" || arg == "--reference") { + params.reference = true; + } else if (arg == "-v") { + params.verbose = true; + } else if (arg == "-p" || arg == "--per-layer-stats") { + params.per_layer_stats = true; + } else if (arg == "--histogram") { + params.print_histogram = true; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-l" || arg == "--include-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_layers.emplace_back(argv[i]); + } else if (arg == "-L" || arg == "--exclude-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.exclude_layers.emplace_back(argv[i]); + } else if (arg == "-t" || arg == "--type") { + if (++i >= argc) { + invalid_param = true; + break; + } + int j; + for (j = 0; j < GGML_TYPE_COUNT; ++j) { + const auto * name = ggml_type_name((ggml_type) j); + if (name && strcmp(argv[i], name) == 0) break; + } + if (j < GGML_TYPE_COUNT) { + params.include_types.push_back((ggml_type) j); + } else { + fprintf(stderr, "error: %s not in list of types\n", argv[i]); + invalid_param = true; + } + } else if (arg == "-n" || arg == "--num-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + max_thread = atoi(argv[i]); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + + print_build_info(); + + // load the model + fprintf(stderr, "Loading model\n"); + + const int64_t t_main_start_us = ggml_time_us(); + llama_model * model; + llama_context * ctx; + + { + auto mparams = llama_model_default_params(); + mparams.use_mlock = false; + + model = llama_model_load_from_file(params.model.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + cparams.n_ctx = 256; + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_model_free(model); + return 1; + } + } + + const auto & tensors = llama_internal_get_tensor_map(model); + + // check layer tensors + int included_layers = 0; + int64_t max_nelements = 0; + bool is_f16 = false; + for (const auto & kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) 
{
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            llama_model_free(model);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
+
+    // loop through quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
+            if (params.verbose) {
+                printf("testing %s ...\n", ggml_type_name(type));
+            }
+
+            ggml_quantize_init(type);
+
+            error_stats global_stats {};
+
+            for (const auto & kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n", kv_tensor.first.c_str());
+                }
+                std::string layer_name { ggml_type_name(type) };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        *qfns, *qfns_cpu,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
+                );
+            }
+
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    llama_model_free(model);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
diff --git a/llama.cpp/tests/test-regex-partial.cpp b/llama.cpp/tests/test-regex-partial.cpp
new file mode 100644
index 0000000..70af6d7
--- /dev/null
+++ b/llama.cpp/tests/test-regex-partial.cpp
@@ -0,0 +1,288 @@
+// Tests common_regex (esp. its partial final matches support).
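+// [editorial note] A "partial final match" is one where the tail of the input
+// could still grow into a full match, e.g. for /abcd/ the input "yeah ab" yields
+// a PARTIAL match over positions {5, 7}; a streaming consumer keeps those
+// characters buffered until further input resolves the match. Sketch of the API
+// as exercised by the cases below:
+//
+//   common_regex cr("abcd");
+//   auto m = cr.search("yeah ab", 0);
+//   // m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL, m.groups[0] == {5, 7}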
+ +#include "common.h" +#include "regex-partial.h" + +#include +#include +#include + +template static void assert_equals(const T & expected, const T & actual) { + if (expected != actual) { + std::cerr << "Expected: " << expected << std::endl; + std::cerr << " Actual: " << actual << std::endl; + std::cerr << std::flush; + throw std::runtime_error("Test failed"); + } +} + +struct test_case { + std::string pattern; + struct input_output { + std::string input; + common_regex_match output; + }; + std::vector inputs_outputs; +}; + +static std::string common_regex_match_type_name(common_regex_match_type type) { + switch (type) { + case COMMON_REGEX_MATCH_TYPE_NONE: + return "COMMON_REGEX_MATCH_TYPE_NONE"; + case COMMON_REGEX_MATCH_TYPE_PARTIAL: + return "COMMON_REGEX_MATCH_TYPE_PARTIAL"; + case COMMON_REGEX_MATCH_TYPE_FULL: + return "COMMON_REGEX_MATCH_TYPE_FULL"; + } + return "?"; +} + +static void test_regex() { + printf("[%s]\n", __func__); + auto test = [](const test_case & test_case) { + common_regex cr(test_case.pattern); + std::cout << "Testing pattern: /" << test_case.pattern << "/\n"; + // std::cout << " partial rev: " << cr.reversed_partial_pattern.str() << '\n'; + for (const auto & input_output : test_case.inputs_outputs) { + std::cout << " Input: " << input_output.input << '\n'; + auto m = cr.search(input_output.input, 0); + if (m != input_output.output) { + auto match_to_str = [&](const std::optional & m) { + std::ostringstream ss; + if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) { + ss << ""; + } else { + GGML_ASSERT(!input_output.output.groups.empty()); + std::vector parts; + for (const auto & g : m->groups) { + parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}"); + } + ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}"; + } + return ss.str(); + }; + std::cout << " Expected: " << match_to_str(input_output.output) << '\n'; + std::cout << " Got: " << match_to_str(m) << '\n'; + std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n"; + + throw std::runtime_error("Test failed"); + } + } + }; + test({ + "a", + { + {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}}, + {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}}, + {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}}, + {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}}, + } + }); + test({ + "abcd", + { + {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}}, + {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}}, + {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}}, + {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}}, + {"d", {}}, + {"bcd", {}}, + {"cde", {}}, + {"cd", {}}, + {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}}, + {"abbie", {}}, + {"", {}}, + } + }); + test({ + ".*?ab", + { + {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}}, + {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}}, + {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}}, + {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}}, + {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}}, + } + }); + test({ + "a.*?b", + { + {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}}, + {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}}, + {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}}, + {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}}, + {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}}, + {"d", {}}, + {"b", {}}, + } + }); + test({ + "ab(?:cd){2,4}ef", + { + // 
{"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}}, + {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}}, + {"abcde", {}}, + {"abcdef", {}}, + {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}}, + {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}}, + {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}}, + {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}}, + {"abcdcdcdcdcdef", {}}, + {"abcde", {}}, + {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}}, + } + }); + test({ + "a(?:rte| pure )fact", + { + {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}}, + {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}}, + {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}}, + {"fact", {}}, + {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}}, + {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}}, + {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}}, + {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}}, + {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}}, + {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}}, + {"" , {}}, + {"pure", {}}, + {"pure fact", {}}, + } + }); + test({ + "abc", + { + {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}}, + {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}}, + {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}}, + {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}}, + {"b", {}}, + {"c", {}}, + {"", {}}, + } + }); + + test({ + "(?:abc)?\\s*def", + { + {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}}, + {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}}, + {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}}, + {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}}, + {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}}, + {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}}, + {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}}, + {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}}, + {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}}, + {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}}, + {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}}, + } + }); + + test({ + "a+b", + { + {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}}, + {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}}, + {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}}, + } + }); + + test({ + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "" + "|" + "|" + "|" + "|" + "|" + "|" + "|" + ")?" 
+ "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call) + ")" + "|]+)>" // match 4 (function name) + "|", // match 5 (function name again) + { + {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}}, + {" {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}}, + {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}}, + {"Let's call something\n{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}}, + {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}}, + {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}}, + {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}}, + {" {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}}, + {" {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}}, + {" {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}}, + {"", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}}, + + } + }); +} + +static void test_regex_to_reversed_partial_regex() { + printf("[%s]\n", __func__); + + assert_equals( + "^((?:(?:c)?b)?a)", + regex_to_reversed_partial_regex("abc")); + + assert_equals( + "^(a+)", + regex_to_reversed_partial_regex("a+")); + + assert_equals( + "^(a*)", + regex_to_reversed_partial_regex("a*")); + + assert_equals( + "^(a?)", + regex_to_reversed_partial_regex("a?")); + + assert_equals( + "^([a-z])", + regex_to_reversed_partial_regex("[a-z]")); + + assert_equals( + "^((?:\\w+)?[a-z])", + regex_to_reversed_partial_regex("[a-z]\\w+")); + + assert_equals( + "^((?:a|b))", + regex_to_reversed_partial_regex("(?:a|b)")); + assert_equals( + "^((?:(?:(?:d)?c)?b)?a)", + regex_to_reversed_partial_regex("abcd")); + assert_equals( + "^((?:b)?a*)", // TODO: ((?:b)?a*+).* ?? + regex_to_reversed_partial_regex("a*b")); + assert_equals( + "^((?:(?:b)?a)?.*)", + regex_to_reversed_partial_regex(".*?ab")); + assert_equals( + "^((?:(?:b)?.*)?a)", + regex_to_reversed_partial_regex("a.*?b")); + assert_equals( + "^((?:(?:d)?(?:(?:c)?b))?a)", + regex_to_reversed_partial_regex("a(bc)d")); + assert_equals( + "^((?:(?:(?:c)?b|(?:e)?d))?a)", + regex_to_reversed_partial_regex("a(bc|de)")); + assert_equals( + "^((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)", + regex_to_reversed_partial_regex("ab{2,4}c")); +} + +int main() { + test_regex_to_reversed_partial_regex(); + test_regex(); + std::cout << "All tests passed.\n"; +} diff --git a/llama.cpp/tests/test-rope.cpp b/llama.cpp/tests/test-rope.cpp new file mode 100644 index 0000000..801e4cd --- /dev/null +++ b/llama.cpp/tests/test-rope.cpp @@ -0,0 +1,263 @@ +#include "ggml.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +#define MAX_NARGS 3 + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define GGML_SILU_FP16 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) 
+#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + +static float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +static int irand(int n) { + if (n == 0) return 0; + return rand()%n; +} + +static void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +static struct ggml_tensor * get_random_tensor_f32( + struct ggml_context * ctx0, + int ndims, + const int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +int main(int /*argc*/, const char ** /*argv*/) { + struct ggml_init_params params = { + /* .mem_size = */ 128*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + std::vector work_buffer; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * x; + + // rope f32 + for (int m = 0; m < 5; ++m) { + const int ndims = 4; + + const int64_t n_rot = 128; + const int64_t ne[4] = { 2*n_rot, 32, 73, 1 }; + + const int n_past_0 = 100; + const int n_past_2 = 33; + + struct ggml_tensor * r0; + struct ggml_tensor * r1; + struct ggml_tensor * r2; + x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + int mode = -1; + + if (m < 2) { + struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + + for (int i = 0; i < ne[2]; ++i) { + ((int32_t *) p0->data)[i] = n_past_0 + i; + ((int32_t *) p1->data)[i] = n_past_2 - n_past_0; + ((int32_t *) p2->data)[i] = n_past_2 + i; + } + // test mode 0, 2 (standard, GPT-NeoX) + mode = m == 0 ? GGML_ROPE_TYPE_NORMAL : GGML_ROPE_TYPE_NEOX; + + // 100, 101, 102, ..., 172 + r0 = ggml_rope(ctx0, x, p0, n_rot, mode); + // -67, -67, -67, ..., -67 + r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. 
forget n_past_0 - n_past_2 tokens + + // 33, 34, 35, ..., 105 + r2 = ggml_rope(ctx0, x, p2, n_rot, mode); + } else { + // testing multi-dimension rope position embedding mode + struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4); + struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4); + struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4); + + int sections[4] = {16, 24, 24, 0}; + + mode = (m == 2) ? GGML_ROPE_TYPE_MROPE : (m == 3) ? GGML_ROPE_TYPE_VISION : GGML_ROPE_TYPE_IMROPE; + + for (int i = 0; i < ne[2]; ++i) { + for (int j = 0; j < 4; ++j) { + ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j; + ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0; + ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j; + } + } + + // [[100, 101, 102, ..., 172], + // [101, 102, 103, ..., 173], + // [102, 103, 104, ..., 174]] + r0 = ggml_rope_multi( + ctx0, x, p0, nullptr, + n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1); + // [[-67, -67, -67, ..., -67] + // [-67, -67, -67, ..., -67] + // [-67, -67, -67, ..., -67]] + r1 = ggml_rope_multi( + ctx0, r0, p1, nullptr, + n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1); + + // [[33, 34, 35, ..., 105] + // [34, 35, 36, ..., 106] + // [35, 36, 37, ..., 107]] + r2 = ggml_rope_multi( + ctx0, x, p2, nullptr, + n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1); + } + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_build_forward_expand(gf, r0); + ggml_build_forward_expand(gf, r1); + ggml_build_forward_expand(gf, r2); + + ggml_graph_compute_helper(work_buffer, gf, 4); + + // check that r1 and r2 are the same + { + double sum0 = 0.0f; + double sum1 = 0.0f; + double diff = 0.0f; + + const float * r1_data = (float *) r1->data; + const float * r2_data = (float *) r2->data; + + const int n_elements = ggml_nelements(r1); + + for (int i = 0; i < n_elements; ++i) { + sum0 += fabs(r1_data[i]); + sum1 += fabs(r2_data[i]); + diff += fabs(r1_data[i] - r2_data[i]); + //if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) { + // printf("%d: %f %f\n", i, r1_data[i], r2_data[i]); + // printf("diff: %f\n", fabs(r1_data[i] - r2_data[i])); + //} + } + + //for (int i = 4096; i < 4096 + 128; ++i) { + // printf("%f %f\n", r1_data[i], r2_data[i]); + //} + + printf("mode: %d\n", mode); + printf("sum0: %f\n", sum0); + printf("sum1: %f\n", sum1); + printf("diff: %f\n", diff); + printf("rel err: %f\n", diff / sum0); + printf("rel err: %f\n", diff / sum1); + + GGML_ASSERT(diff / sum0 < 0.0001f); + GGML_ASSERT(diff / sum1 < 0.0001f); + } + } + + ggml_free(ctx0); + + return 0; +} diff --git a/llama.cpp/tests/test-sampling.cpp b/llama.cpp/tests/test-sampling.cpp new file mode 100644 index 0000000..7cd96c5 --- /dev/null +++ b/llama.cpp/tests/test-sampling.cpp @@ -0,0 +1,400 @@ +#include "ggml.h" +#include "llama.h" + +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include +#include +#include +#include + +extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector>& seq_breakers); + +static void dump(const llama_token_data_array * cur_p) { + for (size_t i = 0; i < cur_p->size; i++) { + printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } +} + +#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0) + +struct sampler_tester { + sampler_tester(size_t n_vocab) { + 
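+        // [editorial note] logit = log(id) makes the softmax probabilities
+        // proportional to the token id: p(i) = i / (n*(n-1)/2), with token 0 at
+        // p == 0. The closed-form size checks in test_sampler_queue below (e.g.
+        // its top-p arithmetic-series cumsum) rely on exactly this distribution.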
cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(token_id); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false }; + } + + sampler_tester(const std::vector & probs, const std::vector & probs_expected) : probs_expected(probs_expected) { + cur.reserve(probs.size()); + for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) { + const float logit = logf(probs[token_id]); + cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]}); + } + + cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false }; + } + + void apply(llama_sampler * sampler) { + llama_sampler_apply(sampler, &cur_p); + llama_sampler_free(sampler); + } + + void check() { + GGML_ASSERT(cur_p.size == probs_expected.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5); + } + } + + llama_token_data_array cur_p; + +private: + const std::vector probs_expected; + + std::vector cur; +}; + +static void test_temp(const std::vector & probs, const std::vector & probs_expected, float temp) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_temp(temp)); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_temp_ext(const std::vector & probs, const std::vector & probs_expected, float temp, float delta, float exponent) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_top_k(const std::vector & probs, const std::vector & probs_expected, int k) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_top_k(k)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_top_p(const std::vector & probs, const std::vector & probs_expected, float p) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_top_p(p, 0)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_min_p(const std::vector & probs, const std::vector & probs_expected, float p) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_min_p(p, 0)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_xtc(const std::vector & probs, const std::vector & probs_expected, float p, float t) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_xtc(p, t, 0, 0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_typical(const std::vector & probs, const std::vector & probs_expected, float p) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_typical(p, 0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_penalties( + const std::vector & probs, const std::vector & last_tokens, + const std::vector & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence +) { + GGML_ASSERT(probs.size() == probs_expected.size()); + + 
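+    // feed the penalty sampler the token history one token at a time via
+    // llama_sampler_accept(), the same way tokens accumulate during decoding,
+    // then apply it once to the candidate array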
sampler_tester tester(probs, probs_expected); + + auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); + + for (size_t i = 0; i < last_tokens.size(); i++) { + llama_sampler_accept(sampler, last_tokens[i]); + } + + DUMP(&tester.cur_p); + tester.apply(sampler); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_dry( + const std::vector & probs, const std::vector & last_tokens, + const std::vector & expected_probs, float dry_multiplier, float dry_base, + int dry_allowed_length, int dry_penalty_last_n, + const std::vector> & seq_breakers +) { + GGML_ASSERT(probs.size() == expected_probs.size()); + + sampler_tester tester(probs, expected_probs); + + auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers); + + for (size_t i = 0; i < last_tokens.size(); i++) { + llama_sampler_accept(sampler, last_tokens[i]); + } + + DUMP(&tester.cur_p); + tester.apply(sampler); + tester.apply(llama_sampler_init_dist(0)); + DUMP(&tester.cur_p); + tester.check(); +} + +static void test_top_n_sigma(const std::vector & probs, const std::vector & probs_expected, int n) { + sampler_tester tester(probs, probs_expected); + + DUMP(&tester.cur_p); + tester.apply(llama_sampler_init_top_n_sigma(n)); + tester.apply(llama_sampler_init_dist (0)); + DUMP(&tester.cur_p); + + tester.check(); +} + +static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p +) { + sampler_tester tester(n_vocab); + + llama_token min_token_id = 0; + const llama_token max_token_id = n_vocab - 1; + + for (auto s : samplers_sequence) { + switch (s) { + case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break; + case 'y': GGML_ABORT("typical test not implemented"); + case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break; + case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break; + case 't': GGML_ABORT("temperature test not implemented"); + default : GGML_ABORT("Unknown sampler"); + } + + tester.apply(llama_sampler_init_dist(0)); + + auto & cur_p = tester.cur_p; + + const int size = cur_p.size; + + if (s == 'k') { + const int expected_size = std::min(size, top_k); + min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k)); + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(cur_p.data[0].id == max_token_id); + GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id); + } else if (s == 'p') { + const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2; + const int softmax_numerator_target = ceilf(top_p * softmax_divisor); + + min_token_id = n_vocab; + int expected_size = 0; + int cumsum = 0; + do { // do-while because always at least one token is sampled + min_token_id--; + expected_size++; + + cumsum += min_token_id; + } while (cumsum < softmax_numerator_target); + + // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns + if (min_token_id == 1) { + min_token_id--; + expected_size += 1; + } + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id); + GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id); + } else if (s == 'm') { + int expected_size = ceilf((1.0f - min_p) * n_vocab); + expected_size = std::max(expected_size, 1); + expected_size = std::min(expected_size, size); + + 
min_token_id = floorf(min_p * n_vocab); + min_token_id = std::max(min_token_id, 1); + min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size)); + min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1)); + + GGML_ASSERT(size == expected_size); + GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id); + GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id); + } else { + GGML_ABORT("fatal error"); + } + } + + printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%5d top_p=%f min_p=%f\n", + samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p); +} + +static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector & data, int n_iter) { + std::vector cur(data.size()); + std::copy(data.begin(), data.end(), cur.begin()); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_sampler_apply(cnstr, &cur_p); + llama_sampler_reset(cnstr); + const int64_t t_start = ggml_time_us(); + for (int i = 0; i < n_iter; i++) { + std::copy(data.begin(), data.end(), cur.begin()); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_sampler_apply(cnstr, &cur_p); + llama_sampler_reset(cnstr); + } + const int64_t t_end = ggml_time_us(); + llama_sampler_free(cnstr); + printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); +} + +#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter)) + +static void test_perf() { + const int n_vocab = 1 << 17; + + std::vector data; + + data.reserve(n_vocab); + for (int i = 0; i < n_vocab; i++) { + const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5); + data.emplace_back(llama_token_data{i, logit, 0.0f}); + } + + BENCH(llama_sampler_init_top_k (40), data, 32); + BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); + BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); + BENCH(llama_sampler_init_typical(0.5f, 1), data, 32); + BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32); +} + +int main(void) { + ggml_time_init(); + + test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f); + test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f); + + test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f, 0.0f, 1.0f); + test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f, 0.0f, 1.0f); + + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0); + + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f); + + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.00f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.24f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f}, 0.26f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f}, 0.49f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f}, 0.51f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f}, 0.74f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); + test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); + test_min_p({0.1f, 0.2f, 
0.3f, 0.4f}, {0.4f/0.4f}, 1.05f); + + printf("XTC should:\n"); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f); + + printf("XTC should not:\n"); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f); + + test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); + test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); + + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0, 0.25f, 0.25f, 0.25f, 0.25f}, 50.0f, 0.0f, 0.0f); + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f); + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f); + + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.000011f, 0.249997f, 0.249997f, 0.249997f, 0.249997f}, 1.0f, 5.0f, 5.0f); + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.000023f, 0.000023f, 0.000023f, 0.499966f, 0.499966f}, 1.0f, 5.0f, 5.0f); + test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.000000f, 0.000023f, 0.000023f, 0.499977f, 0.499977f}, 1.0f, 5.0f, 5.0f); + + + test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {}); + test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.109232f, 0.296923f}, 1.0f, 1.1f, 2, 5, {}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.032727f, 0.241818f, 0.241818f}, 2.0f, 1.1f, 2, 5, {}); + test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {}); + + test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f); + test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345 + test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f); + + test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "k", 1, 1.0f, 1.0f); + test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12); + + test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.0003f, 1.0f); + test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f); + test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f); + test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f); + + test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "km", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f); + test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f); + + test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f); + test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f); + + printf("OK\n"); + + test_perf(); + + return 0; +} diff --git 
a/llama.cpp/tests/test-state-restore-fragmented.cpp b/llama.cpp/tests/test-state-restore-fragmented.cpp
new file mode 100644
index 0000000..481b39d
--- /dev/null
+++ b/llama.cpp/tests/test-state-restore-fragmented.cpp
@@ -0,0 +1,122 @@
+// Test for state restore with a fragmented KV cache.
+// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527
+// The issue was that state restore required contiguous KV cache slots,
+// which fails when the cache is fragmented.
+//
+// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false)
+// in state_read_meta(), allowing non-contiguous slot allocation.
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.sampling.seed = 1234;
+    params.kv_unified    = true;
+    params.n_parallel    = 3;
+    params.n_ctx         = 256;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init
+    common_init_result_ptr llama_init = common_init_from_params(params);
+
+    llama_model   * model = llama_init->model();
+    llama_context * ctx   = llama_init->context();
+
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    GGML_UNUSED(model);
+
+    // build a dummy prompt: 70 tokens, all with token id 1
+    std::vector<llama_token> tokens(70, 1);
+
+    // interleave the 3 sequences:
+    // 012012012...
+    llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        for (int s = 0; s < params.n_parallel; ++s) {
+            common_batch_add(batch, tokens[i], i, {s}, false);
+        }
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode prompt batch\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size());
+
+    // Save state of seq 1
+    std::vector<uint8_t> seq_state(llama_state_seq_get_size(ctx, 1));
+    const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1);
+    if (ncopy != seq_state.size()) {
+        fprintf(stderr, "%s : failed to save seq 1 state\n", __func__);
+        return 1;
+    }
+    fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy);
+
+    // clear seq 1 to create "holes" in the KV cache (fragmentation)
+    // 0.2 0.2 0.2 0.2 ...
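+    // to make the layout concrete (illustrative sketch, assuming the unified
+    // cache is filled in the interleaved order built above):
+    //   cells before seq_rm : 0 1 2 0 1 2 0 1 2 ...
+    //   cells after  seq_rm : 0 . 2 0 . 2 0 . 2 ...   ('.' = freed cell)
+    // the freed cells are 70 isolated one-cell holes, so no contiguous run of
+    // 70 cells exists and a contiguous-only restore would fail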
+ llama_memory_t mem = llama_get_memory(ctx); + llama_memory_seq_rm(mem, 1, -1, -1); + fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__); + + // Now the cache has holes where seq 1 was + // This creates fragmentation - there's no contiguous block large enough + // for the seq 1 state if we only look for contiguous slots + + // Restore seq 1 state into seq 1 (should work with non-contiguous allocation) + // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1) + // Before the fix, this would fail with "failed to find available cells in kv cache" + const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1); + if (nset != seq_state.size()) { + fprintf(stderr, "%s : FAILED to restore seq state into fragmented cache (got %zu, expected %zu)\n", + __func__, nset, seq_state.size()); + fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__); + llama_batch_free(batch); + return 1; + } + fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); + + // Verify we can decode with the restored state + // Generate one token to verify the restored state is usable + auto sparams = llama_sampler_chain_default_params(); + llama_sampler * smpl = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); + + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); + + common_batch_clear(batch); + common_batch_add(batch, next_token, (int)tokens.size(), {1}, true); + + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to decode with restored state\n", __func__); + llama_sampler_free(smpl); + llama_batch_free(batch); + return 1; + } + + fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str()); + fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__); + + llama_sampler_free(smpl); + llama_batch_free(batch); + + return 0; +} diff --git a/llama.cpp/tests/test-thread-safety.cpp b/llama.cpp/tests/test-thread-safety.cpp new file mode 100644 index 0000000..bcb86c3 --- /dev/null +++ b/llama.cpp/tests/test-thread-safety.cpp @@ -0,0 +1,164 @@ +// thread safety test +// - Loads a copy of the same model on each GPU, plus a copy on the CPU +// - Creates n_parallel (--parallel) contexts per model +// - Runs inference in parallel on each context + +#include +#include +#include +#include +#include "llama.h" +#include "arg.h" +#include "common.h" +#include "log.h" +#include "sampling.h" + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + return 1; + } + + common_init(); + + llama_backend_init(); + llama_numa_init(params.numa); + + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + + //llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + // if (level == GGML_LOG_LEVEL_ERROR) { + // common_log_add(common_log_main(), level, "%s", text); + // } + //}, NULL); + + auto cparams = common_context_params_to_llama(params); + + // each context has a single sequence + cparams.n_seq_max = 1; + + int dev_count = ggml_backend_dev_count(); + std::vector> gpus; + for (int i = 0; i < dev_count; ++i) { + auto * dev = ggml_backend_dev_get(i); + if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + gpus.push_back({dev, nullptr}); + } + } + const int 
gpu_dev_count = (int)gpus.size(); + const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split + //const int num_models = std::max(1, gpu_dev_count); + const int num_contexts = std::max(1, params.n_parallel); + + std::vector models; + std::vector threads; + std::atomic failed = false; + + for (int m = 0; m < num_models; ++m) { + auto mparams = common_model_params_to_llama(params); + + if (m < gpu_dev_count) { + mparams.split_mode = LLAMA_SPLIT_MODE_NONE; + mparams.devices = gpus[m].data(); + } else if (m == gpu_dev_count) { + mparams.split_mode = LLAMA_SPLIT_MODE_NONE; + mparams.main_gpu = -1; // CPU model + } else { + mparams.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + if (model == NULL) { + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str()); + return 1; + } + + models.emplace_back(model); + } + + for (int m = 0; m < num_models; ++m) { + auto * model = models[m].get(); + for (int c = 0; c < num_contexts; ++c) { + threads.emplace_back([&, m, c, model]() { + LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models); + + llama_context_ptr ctx { llama_init_from_model(model, cparams) }; + if (ctx == NULL) { + LOG_ERR("failed to create context\n"); + failed.store(true); + return; + } + + std::unique_ptr sampler { common_sampler_init(model, params.sampling), common_sampler_free }; + if (sampler == NULL) { + LOG_ERR("failed to create sampler\n"); + failed.store(true); + return; + } + + llama_batch batch = {}; + { + auto prompt = common_tokenize(ctx.get(), params.prompt, true); + if (prompt.empty()) { + LOG_ERR("failed to tokenize prompt\n"); + failed.store(true); + return; + } + batch = llama_batch_get_one(prompt.data(), prompt.size()); + if (llama_decode(ctx.get(), batch)) { + LOG_ERR("failed to decode prompt\n"); + failed.store(true); + return; + } + } + + const auto * vocab = llama_model_get_vocab(model); + std::string result = params.prompt; + + for (int i = 0; i < params.n_predict; i++) { + llama_token token; + if (batch.n_tokens > 0) { + token = common_sampler_sample(sampler.get(), ctx.get(), batch.n_tokens - 1); + } else { + token = llama_vocab_bos(vocab); + } + + result += common_token_to_piece(ctx.get(), token); + + if (llama_vocab_is_eog(vocab, token)) { + break; + } + + batch = llama_batch_get_one(&token, 1); + + int ret = llama_decode(ctx.get(), batch); + if (ret == 1 && i > 0) { + LOG_INF("Context full, stopping generation.\n"); + break; + } + + if (ret != 0) { + LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts); + failed.store(true); + return; + } + } + + LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str()); + }); + } + } + + for (auto & thread : threads) { + thread.join(); + } + + if (failed) { + LOG_ERR("One or more threads failed.\n"); + return 1; + } + + LOG_INF("All threads finished without errors.\n"); + return 0; +} diff --git a/llama.cpp/tests/test-tokenizer-0.cpp b/llama.cpp/tests/test-tokenizer-0.cpp new file mode 100644 index 0000000..37f8312 --- /dev/null +++ b/llama.cpp/tests/test-tokenizer-0.cpp @@ -0,0 +1,312 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include +#include + +//static const std::map> & k_tests() { +// static std::map> _k_tests = { +// { "" , { }, }, +// { " " , { 220, }, }, +// { " " , { 256, }, }, +// { " " , { 262, }, 
}, +// { "\t" , { 197, }, }, +// { "\n" , { 198, }, }, +// { "\n\n" , { 271, }, }, +// { "\n\n\n" , { 1432, }, }, +// { "\t\n" , { 1602, }, }, +// { "Hello world" , { 9906, 1917, }, }, +// { " Hello world" , { 22691, 1917, }, }, +// { "Hello World" , { 9906, 4435, }, }, +// { " Hello World" , { 22691, 4435, }, }, +// { " Hello World!" , { 22691, 4435, 0, }, }, +// { "Hello, world!" , { 9906, 11, 1917, 0, }, }, +// { " Hello, world!" , { 22691, 11, 1917, 0, }, }, +// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, }, +// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, }, +// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, }, +// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, }, +// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, }, +// { "Hello" , { 9906, }, }, +// { " Hello" , { 22691, }, }, +// { " Hello" , { 220, 22691, }, }, +// { " Hello" , { 256, 22691, }, }, +// { " Hello" , { 262, 22691, }, }, +// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, }, +// { " (" , { 320, }, }, +// { "\n =" , { 198, 284, }, }, +// { "' era" , { 6, 11639, }, }, +// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, }, +// { "3" , { 18, }, }, +// { "33" , { 1644, }, }, +// { "333" , { 8765, }, }, +// { "3333" , { 8765, 18, }, }, +// { "33333" , { 8765, 1644, }, }, +// { "333333" , { 8765, 8765, }, }, +// { "3333333" , { 8765, 8765, 18, }, }, +// { "33333333" , { 8765, 8765, 1644, }, }, +// { "333333333" , { 8765, 8765, 8765, }, }, +// }; +// +// return _k_tests; +//} + +using llama_tests = std::map>; + +static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) { + llama_tests tests; + + std::ifstream ifs_inp(fname_inp); + if (!ifs_inp) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str()); + return tests; + } + + std::string sraw((std::istreambuf_iterator(ifs_inp)), std::istreambuf_iterator()); + + std::ifstream ifs_out(fname_out); + if (!ifs_out) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return tests; + } + + std::vector sout; + for (std::string line; std::getline(ifs_out, line);) { + sout.push_back(line); + } + + const std::string sep = "\n__ggml_vocab_test__\n"; + + std::vector sinp; + + size_t pos = 0; + while (pos < sraw.size()) { + const size_t next = sraw.find(sep, pos); + if (next == std::string::npos) { + sinp.push_back(sraw.substr(pos)); + break; + } + sinp.push_back(sraw.substr(pos, next - pos)); + pos = next + sep.size(); + } + + if (sinp.size() != sout.size()) { + fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__); + return tests; + } + + for (size_t i = 0; i < sinp.size(); ++i) { + const std::string & s = sinp[i]; + const std::string & o = string_strip(sout[i]); + + std::vector toks; + + size_t pos = 0; + while (pos < o.size()) { + size_t next = o.find(' ', pos); + if (next == std::string::npos) { + next = 
o.size(); + } + const std::string stok = o.substr(pos, next - pos); + toks.push_back(std::stoi(stok)); + pos = next + 1; + } + + tests[s] = toks; + } + + return tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + const std::string fname_inp = fname + ".inp"; + const std::string fname_out = fname + ".out"; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_model_load_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_model_free(model); + return 1; + } + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + bool success = true; + + const auto k_tests = [&]() -> llama_tests { + if (!fname_text.empty()) { + return {}; + } + + const auto res = read_tests(fname_inp, fname_out); + + if (res.empty()) { + fprintf(stderr, "%s : error: no tests found\n", __func__); + exit(1); + } + + return res; + }(); + + const bool add_special = false; + + // multi-threaded tokenization + const int nthread = std::thread::hardware_concurrency(); + std::vector threads(nthread); + + for (int i = 0; i < nthread; i++) { + threads[i] = std::thread([&, i]() { + for (const auto & test_kv : k_tests) { + const std::vector res = common_tokenize(ctx, test_kv.first, add_special, false); + + // here only print the result of the first thread + // because the other threads are running the same tests + if (i != 0) { + continue; + } + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", common_detokenize(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + common_detokenize(ctx, res).c_str(), + common_detokenize(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + + success = false; + } + } + }); + } + + for (int i = 0; i < nthread; i++) { + threads[i].join(); + } + + // single threaded tokenization + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + 
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = common_tokenize(ctx, text, add_special, false);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<llama_token>{tok})) << "'" << std::endl;
+                ofs << tok << "\n";
+            }
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    llama_backend_free();
+
+    printf("\n");
+    printf("Tests %s\n", success ? "passed" : "failed");
+
+    return success ? 0 : 3;
+}
diff --git a/llama.cpp/tests/test-tokenizer-0.py b/llama.cpp/tests/test-tokenizer-0.py
new file mode 100644
index 0000000..cd760d1
--- /dev/null
+++ b/llama.cpp/tests/test-tokenizer-0.py
@@ -0,0 +1,46 @@
+import time
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+fname_tok = args.fname_tok
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+print('tokenizing file: ', fname_tok)  # noqa: NP100
+fname_out = fname_tok + '.tok'
+with open(fname_tok, 'r', encoding='utf-8') as f:
+    lines = f.readlines()
+    s = ''.join(lines)
+    t_start = time.time()
+    res = tokenizer.encode(s, add_special_tokens=False)
+    t_end = time.time()
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')  # noqa: NP100
+    with open(fname_out, 'w', encoding='utf-8') as f:
+        for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
+            f.write(str(x) + '\n')
+    print('len(res): ', len(res))  # noqa: NP100
+    print('len(lines): ', len(lines))  # noqa: NP100
+print('results written to: ', fname_out)  # noqa: NP100
diff --git a/llama.cpp/tests/test-tokenizer-0.sh b/llama.cpp/tests/test-tokenizer-0.sh
new file mode 100755
index 0000000..7ef009d
--- /dev/null
+++ b/llama.cpp/tests/test-tokenizer-0.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Usage:
+#
+#   test-tokenizer-0.sh <name> <input>
+#
+
+if [ $# -ne 2 ]; then
+    printf "Usage: $0 <name> <input>\n"
+    exit 1
+fi
+
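+# example invocation (illustrative paths; assumes the models/ layout the
+# commands below rely on, with a prepared ggml-vocab-<name>.gguf and a
+# matching HF tokenizer directory):
+#
+#   ./tests/test-tokenizer-0.sh llama-bpe ./wikitext-2-raw/wiki.train.raw
+#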
+name=$1 +input=$2 + +make -j tests/test-tokenizer-0 + +printf "Testing %s on %s ...\n" $name $input + +set -e + +printf "Tokenizing using (py) Python AutoTokenizer ...\n" +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 + +printf "Tokenizing using (cpp) llama.cpp ...\n" +./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 + +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" +cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" + +set +e + +diff $input.tok $input.tokcpp > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + printf "Tokenization is correct!\n" +else + diff $input.tok $input.tokcpp | head -n 32 + + printf "Tokenization differs!\n" +fi diff --git a/llama.cpp/tests/test-tokenizer-1-bpe.cpp b/llama.cpp/tests/test-tokenizer-1-bpe.cpp new file mode 100644 index 0000000..505dbfd --- /dev/null +++ b/llama.cpp/tests/test-tokenizer-1-bpe.cpp @@ -0,0 +1,155 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include "../src/unicode.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) { + if (argc < 2 || argc > 3) { + fprintf(stderr, "Usage: %s [--ignore-merges]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + bool ignore_merges = false; + if (argc == 3) { + if (std::strcmp(argv[2], "--ignore-merges") != 0) { + fprintf(stderr, "Usage: %s [--ignore-merges]\n", argv[0]); + return 1; + } + ignore_merges = true; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + if (ignore_merges) { + fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__); + } + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_model_load_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_model_free(model); + return 1; + } + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + + //GGML_ASSERT(llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_BPE); + if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_BPE) { + return 99; + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + const int n_vocab = llama_vocab_n_tokens(vocab); + + for (int i = 0; i < n_vocab; ++i) { + std::string str = common_detokenize(ctx, std::vector(1, i)); + try { + auto cps = unicode_cpts_from_utf8(str); + std::vector tokens = common_tokenize(ctx, str, false, true); + if (ignore_merges && tokens.size() > 1) { + fprintf(stderr, + "%s : error: token %d detokenizes to '%s'(%zu) but " + "tokenization of this to multiple tokens: [", + __func__, i, str.c_str(), str.length()); + fprintf(stderr, "%d", tokens[0]); + for (size_t i = 1; i < tokens.size(); i++) { + fprintf(stderr, ", %d", tokens[i]); + } + fprintf(stderr, "]\n"); + return 2; + } + std::string check = common_detokenize(ctx, tokens); + if (check != str) { + fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization 
of this detokenizes to '%s'(%zu)\n", + __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); + return 2; + } + } + catch (const std::invalid_argument &) { + //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); + } + } + + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + std::atomic_int errcode = {}; + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx, &errcode]() { + for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) { + if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs} + (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn} + continue; + } + + std::string str = unicode_cpt_to_utf8(cp); + std::vector tokens = common_tokenize(ctx, str, false); + std::string check = common_detokenize(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + errcode = 3; + } + } + }); + } + + for (auto & t : threads) { + t.join(); + } + + if (errcode) { + return errcode; + } + } + + llama_free(ctx); + llama_model_free(model); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tests/test-tokenizer-1-spm.cpp b/llama.cpp/tests/test-tokenizer-1-spm.cpp new file mode 100644 index 0000000..8e370d2 --- /dev/null +++ b/llama.cpp/tests/test-tokenizer-1-spm.cpp @@ -0,0 +1,125 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include "../src/unicode.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_model_load_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_model_free(model); + return 1; + } + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + + //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); + if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_SPM) { + return 99; + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + const int n_vocab = llama_vocab_n_tokens(vocab); + + for (int i = 0; i < n_vocab; ++i) { + std::string str = common_detokenize(ctx, std::vector(1, i), true); + std::vector tokens = common_tokenize(ctx, str, false, true); + std::string check = common_detokenize(ctx, tokens); + if (check != str) { + fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", + __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); + return 2; + } + } + + // unicode + { + const int nthread = std::thread::hardware_concurrency(); + + std::vector threads(nthread); + + 
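+        // each thread walks a strided slice of the codepoint space
+        // (cp = i, i + nthread, i + 2*nthread, ...), so together the threads
+        // cover 0x0 .. 0x10FFFF exactly once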
std::atomic_int errcode = {}; + + for (int i = 0; i < nthread; ++i) { + threads[i] = std::thread([i, nthread, ctx, &errcode]() { + for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) { + if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs} + (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn} + continue; + } + + std::string str = unicode_cpt_to_utf8(cp); + std::vector tokens = common_tokenize(ctx, str, false, true); + std::string check = common_detokenize(ctx, tokens); + if (cp != 9601 && str != check) { + fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + cp, check.c_str(), check.length(), str.c_str(), str.length()); + errcode = 3; + } + } + }); + } + + for (auto & t : threads) { + t.join(); + } + + if(errcode) { + return errcode; + } + } + + llama_free(ctx); + llama_model_free(model); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tests/test-tokenizer-random.py b/llama.cpp/tests/test-tokenizer-random.py new file mode 100644 index 0000000..93e6976 --- /dev/null +++ b/llama.cpp/tests/test-tokenizer-random.py @@ -0,0 +1,566 @@ +# Test libllama tokenizer == AutoTokenizer. +# Brute force random words/text generation. +# +# Sample usage: +# +# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +# + +from __future__ import annotations + +import time +import logging +import argparse +import subprocess +import random +import unicodedata + +from pathlib import Path +from typing import Any, Iterator, cast +from typing_extensions import Buffer + +import cffi +from transformers import AutoTokenizer, PreTrainedTokenizer + + +logger = logging.getLogger("test-tokenizer-random") + + +class LibLlama: + + DEFAULT_PATH_LLAMA_H = "./include/llama.h" + DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] + DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON + + def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): + path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H + path_includes = path_includes or self.DEFAULT_PATH_INCLUDES + path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA + (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama) + self.lib.llama_backend_init() + + def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]: + cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="] + cmd += ["-I" + path for path in path_includes] + [path_llama_h] + res = subprocess.run(cmd, stdout=subprocess.PIPE) + assert (res.returncode == 0) + source = res.stdout.decode() + ffi = cffi.FFI() + if True: # workarounds for pycparser + source = "typedef struct { } __builtin_va_list;" + "\n" + source + source = source.replace("sizeof (int)", str(ffi.sizeof("int"))) + source = source.replace("sizeof (void *)", str(ffi.sizeof("void*"))) + source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t"))) + source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t"))) + ffi.cdef(source, override=True) + lib = ffi.dlopen(path_libllama) + return (ffi, lib) + + def model_default_params(self, **kwargs): + mparams = self.lib.llama_model_default_params() + for k, v in kwargs.items(): + setattr(mparams, k, v) + return mparams + + def context_default_params(self, **kwargs): + cparams = self.lib.llama_context_default_params() + 
for k, v in kwargs.items(): + setattr(cparams, k, v) + return cparams + + +class LibLlamaModel: + + def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}): + self.lib: Any = libllama.lib + self.ffi = libllama.ffi + if isinstance(mparams, dict): + mparams = libllama.model_default_params(**mparams) + self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams) + if not self.model: + raise RuntimeError("error: failed to load model '%s'" % path_model) + if isinstance(cparams, dict): + cparams = libllama.context_default_params(**cparams) + self.ctx = self.lib.llama_new_context_with_model(self.model, cparams) + if not self.ctx: + raise RuntimeError("error: failed to create context for model '%s'" % path_model) + n_tokens_max = self.lib.llama_n_ctx(self.ctx) + self.token_ids = self.ffi.new("llama_token[]", n_tokens_max) + self.text_buff = self.ffi.new("uint8_t[]", 1024) + + def free(self): + if self.ctx: + self.lib.llama_free(self.ctx) + if self.model: + self.lib.llama_model_free(self.model) + self.ctx = None + self.model = None + self.lib = None + + def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]: + encoded_text: bytes = text.encode("utf-8") + num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) + while num < 0 and len(self.token_ids) < (16 << 20): + self.token_ids = self.ffi.new("llama_token[]", -2 * num) + num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special) + return list(self.token_ids[0:num]) + + def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str: + if len(self.token_ids) < len(ids): + self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids)) + for i, id in enumerate(ids): + self.token_ids[i] = id + num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) + while num < 0 and len(self.text_buff) < (16 << 20): + self.text_buff = self.ffi.new("uint8_t[]", -2 * num) + num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) + return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + + +class Tokenizer: + + def encode(self, text: str) -> list[int]: + raise NotImplementedError + + def decode(self, ids: list[int]) -> str: + raise NotImplementedError + + +class TokenizerGroundtruth (Tokenizer): + + def __init__(self, dir_tokenizer: str): + self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + # guess BOS and EOS + ids = self.encode("a") + assert 1 <= len(ids) <= 3 + add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0] + add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1] + self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token) + self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token) + # build vocab + tokens = list(self.model.get_vocab().values()) + self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True) + self.vocab = list(sorted(self.vocab)) + # tokens and lists + self.special_tokens = list(self.model.all_special_tokens) + self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False) + 
self.bos_token = self.model.bos_token + self.eos_token = self.model.eos_token + + def encode(self, text: str) -> list[int]: + return self.model.encode(text, add_special_tokens=True) + + def decode(self, ids: list[int]) -> str: + return self.model.decode(ids, skip_special_tokens=False) + + +class TokenizerLlamaCpp (Tokenizer): + + libllama: LibLlama | None = None + + def __init__(self, vocab_file: str): + if not self.libllama: + self.libllama = LibLlama() + self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096)) + + def encode(self, text: str) -> list[int]: + return self.model.tokenize(text, add_special=True, parse_special=True) + + def decode(self, ids: list[int]) -> str: + return self.model.detokenize(ids, remove_special=False, unparse_special=True) + + +def generator_custom_text() -> Iterator[str]: + """General tests""" + yield from [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\n\n", + "\n\n\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + " (", + "\n =", + "' era", + "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "3", + "33", + "333", + "3333", + "33333", + "333333", + "3333333", + "33333333", + "333333333", + ] + + +def generator_custom_text_edge_cases() -> Iterator[str]: + """Edge cases found while debugging""" + yield from [ + '\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F} + '¼-a', # unicode_ranges_digit, 0x00BC + '½-a', # unicode_ranges_digit, 0x00BD + '¾-a', # unicode_ranges_digit, 0x00BE + 'a 〇b', # unicode_ranges_digit, 0x3007 + 'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms + '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) + 'Cửa Việt', # llama-3, ignore_merges = true + 'a', # Phi-3 fail + '<|endoftext|>', # Phi-3 fail + 'a\na', # bert fail + '"`', # falcon + ' \u2e4e', # falcon + '\n\x0b ', # falcon + 'a\xa0\xa0\x00b', # jina-v2-es + 'one ', # jina-v2-es lstrip=true + 'a b', # rstrip phi-3 + 'a b', # lstrip jina-v2 + '\xa0aC', # deepseek + '\u2029 \uA3E4', # deepseek-llm + "a ?", + 'å', # mpt + '\U000ac517', # utf-8 encode error, falcon + '\U000522f4', # utf-8 encode error, starcoder + "abcd", + " abcd", + ] + + +def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]: + """Brute force check all vocab words""" + yield from tokenizer.vocab + + +def generator_ascii_lr_strip() -> Iterator[str]: + WHITESPACES = ["", " ", " "] + CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""] + for char1 in CHARACTERS: + for char2 in CHARACTERS: + for lstrip in WHITESPACES: + for rstrip in WHITESPACES: + yield lstrip + char1 + char2 + rstrip + yield lstrip + char1 + rstrip + char2 + yield char1 + lstrip + char2 + rstrip + + +def generator_apostrophe() -> Iterator[str]: + WHITESPACES = ["", " ", " "] + CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""] + for char1 in CHARACTERS: + for char2 in CHARACTERS: + for lstrip in WHITESPACES: + for rstrip in WHITESPACES: + yield char1 + lstrip + "'" + rstrip + char2 + yield char1 + char2 + lstrip + "'" + rstrip + "z" + yield "a" + lstrip + "'" + rstrip + char1 + char2 + + +def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]: + WHITESPACES = ["", " 
", " ", "\n", "\r\n", "\n\n", "\t", "\t\t"] + all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens))) + for token in all_tokens: + for lstrip in WHITESPACES: + for rstrip in WHITESPACES: + yield lstrip + token + rstrip + yield "a" + lstrip + token + rstrip + yield lstrip + token + rstrip + "z" + yield "a" + lstrip + token + rstrip + "z" + + +def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]: + separations = [" ", "\n", "\t", "-", "!", "one", "1", "", ""] + all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations))) + rand = random.Random() + for m in range(iterations): + rand.seed(m) + words = rand.choices(all_tokens, k=500) + if words and words[0] == tokenizer.bos_token: # skip spam warning of double BOS + while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS + words.pop(0) + if tokenizer.add_bos_token: # drop all starting BOS + words.pop(0) + if words and words[-1] == tokenizer.eos_token: # skip spam warning of double EOS + while len(words) > 1 and words[-2] == tokenizer.eos_token: # leave one trailing EOS + words.pop(-1) + if tokenizer.add_bos_token: # drop all trailing EOS + words.pop(-1) + yield "".join(words) + + +def generator_random_chars(iterations=100) -> Iterator[str]: + """Brute force random text with simple characters""" + + NUM_WORDS = 400 + WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) + CHARS = list(sorted(set(""" + ABCDEFGHIJKLMNOPQRSTUVWXYZ + abcdefghijklmnopqrstuvwxyz + ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ + áéíóúàèìòùâêîôûäëïöü + .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_ + """))) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + for _ in range(NUM_WORDS): + k = rand.randint(1, 7) + word = rand.choices(CHARS, k=k) + word.append(rand.choice(WHITESPACES)) + text.append("".join(word)) + yield "".join(text) + + +def generator_unicodes() -> Iterator[str]: + """Iterate unicode characters""" + + MAX_CODEPOINTS = 0x30000 # 0x110000 + + def _valid(cpt): + if cpt >= 0x30000: # unassigned and supplement­ary + return False + # if cpt == 0x2029: # deepseek-llm + # return False + if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"): # undefined, surrogates, private + return False + return True + + characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)] + + yield from characters + + +def generator_random_unicodes(iterations=100) -> Iterator[str]: + """Brute force random text with unicode characters""" + + NUM_WORDS = 200 + WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) + + characters = list(generator_unicodes()) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + for _ in range(NUM_WORDS): + k = rand.randint(1, 7) + word = rand.choices(characters, k=k) + word.append(rand.choice(WHITESPACES)) + text.append("".join(word)) + yield "".join(text) + + +def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]: + """Brute force random text with vocab characters""" + + vocab_chars = set() + for word in tokenizer.vocab: + vocab_chars.update(word) + vocab_chars = list(sorted(vocab_chars)) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = rand.choices(vocab_chars, k=1024) + yield "".join(text) + + +def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]: + """Brute force random text from vocab words""" + + vocab = [w.strip() for w in 
tokenizer.vocab] + yield from vocab + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + num_words = rand.randint(300, 400) + for i in range(num_words): + k = rand.randint(1, 3) + words = rand.choices(vocab, k=k) + sep = rand.choice(" \n\r\t") + text.append("".join(words) + sep) + yield "".join(text) + + +def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]): + + def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str): + for i, (a, b) in enumerate(zip(ids1, ids2)): + if a != b: + return i + if len(ids1) == len(ids2): + return -1 + return min(len(ids1), len(ids2)) + + def check_detokenizer(text: str, text1: str, text2: str) -> bool: + if text1 == text2: # equal to TokenizerGroundtruth? + return True + # equal to source text? + if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str): # remove BOS + if text2.startswith(tokenizer1.bos_token): + text2 = text2[len(tokenizer1.bos_token):] + if tokenizer1.add_eos_token and tokenizer1.eos_token and isinstance(tokenizer1.eos_token, str): # remove EOS + if text2.endswith(tokenizer1.eos_token): + text2 = text2[:-len(tokenizer1.eos_token)] + return text == text2 + + t_encode1 = 0 + t_encode2 = 0 + t_decode1 = 0 + t_decode2 = 0 + t_start = time.perf_counter() + encode_errors = 0 + decode_errors = 0 + MAX_ERRORS = 10 + + logger.info("%s: %s" % (generator.__qualname__, "ini")) + for text in generator: + # print(repr(text), text.encode()) + # print(repr(text), hex(ord(text[0])), text.encode()) + t0 = time.perf_counter() + ids1 = tokenizer1.encode(text) + t1 = time.perf_counter() + ids2 = tokenizer2.encode(text) + t2 = time.perf_counter() + text1 = tokenizer1.decode(ids1) + t3 = time.perf_counter() + text2 = tokenizer2.decode(ids1) + t4 = time.perf_counter() + t_encode1 += t1 - t0 + t_encode2 += t2 - t1 + t_decode1 += t3 - t2 + t_decode2 += t4 - t3 + if encode_errors < MAX_ERRORS and ids1 != ids2: + i = find_first_mismatch(ids1, ids2) + ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] + ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] + logger.error(" Expected: " + str(ids1)) + logger.error(" Result: " + str(ids2)) + encode_errors += 1 + logger.error(f" {encode_errors=}") + if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2): + i = find_first_mismatch(text1, text2) + text1 = list(text1[max(0, i - 2) : i + 5 + 1]) + text2 = list(text2[max(0, i - 2) : i + 5 + 1]) + logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1)) + logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2)) + decode_errors += 1 + logger.error(f" {decode_errors=}") + if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS: + logger.error(f" EXIT: {encode_errors=} {decode_errors=}") + # raise Exception() + break + + t_total = time.perf_counter() - t_start + logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + + +def main(argv: list[str] | None = None): + parser = argparse.ArgumentParser() + parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file") + parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(argv) + + logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO) + logger.info(f"VOCABFILE: '{args.vocab_file}'") + + tokenizer1 
diff --git a/llama.cpp/tests/test-tokenizers-repo.sh b/llama.cpp/tests/test-tokenizers-repo.sh
new file mode 100755
index 0000000..94a3d05
--- /dev/null
+++ b/llama.cpp/tests/test-tokenizers-repo.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+if [ $# -lt 2 ]; then
+    printf "Usage: $0 <repo> <folder> [<toktest>]\n"
+    exit 1
+fi
+
+if [ $# -eq 3 ]; then
+    toktest=$3
+else
+    toktest="./test-tokenizer-0"
+fi
+
+if [ ! -x $toktest ]; then
+    printf "Test executable \"$toktest\" not found!\n"
+    exit 1
+fi
+
+repo=$1
+folder=$2
+
+if [ -d $folder ] && [ -d $folder/.git ]; then
+    (cd $folder; git pull)
+else
+    git clone $repo $folder
+
+    # byteswap models if on big endian
+    if [ "$(uname -m)" = s390x ]; then
+        for f in $folder/*/*.gguf; do
+            echo YES | python3 "$(dirname $0)/../gguf-py/gguf/scripts/gguf_convert_endian.py" $f big
+        done
+    fi
+fi
+
+shopt -s globstar
+for gguf in $folder/**/*.gguf; do
+    if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
+        $toktest $gguf
+    else
+        printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
+    fi
+done
+
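
Note: a typical invocation, for orientation only; the repository URL below is illustrative, and the third argument may be omitted since it defaults to ./test-tokenizer-0 as above:

    ./test-tokenizers-repo.sh https://huggingface.co/example-org/tokenizers ./models/tokenizers ./test-tokenizer-0
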
diff --git a/llama.cpp/tests/testing.h b/llama.cpp/tests/testing.h
new file mode 100644
index 0000000..7949483
--- /dev/null
+++ b/llama.cpp/tests/testing.h
@@ -0,0 +1,243 @@
+#pragma once
+
+#include "common.h"
+
+#include <chrono>
+#include <cstddef>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <vector>
+
+struct testing {
+    std::ostream &out;
+    std::vector<std::string> stack;
+    std::regex filter;
+    bool filter_tests = false;
+    bool throw_exception = false;
+    bool verbose = false;
+    int tests = 0;
+    int assertions = 0;
+    int failures = 0;
+    int unnamed = 0;
+    int exceptions = 0;
+
+    static constexpr std::size_t status_column = 80;
+
+    explicit testing(std::ostream &os = std::cout) : out(os) {}
+
+    std::string indent() const {
+        if (stack.empty()) {
+            return "";
+        }
+        return std::string((stack.size() - 1) * 2, ' ');
+    }
+
+    std::string full_name() const {
+        return string_join(stack, ".");
+    }
+
+    void log(const std::string & msg) {
+        if (verbose) {
+            out << indent() << "  " << msg << "\n";
+        }
+    }
+
+    void set_filter(const std::string & re) {
+        filter = std::regex(re);
+        filter_tests = true;
+    }
+
+    bool should_run() const {
+        if (filter_tests) {
+            if (!std::regex_match(full_name(), filter)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <typename F>
+    void run_with_exceptions(F &&f, const char *ctx) {
+        try {
+            f();
+        } catch (const std::exception &e) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): " << e.what() << "\n";
+            if (throw_exception) {
+                throw;
+            }
+        } catch (...) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): unknown\n";
+            if (throw_exception) {
+                throw;
+            }
+        }
+    }
+
+    void print_result(const std::string &label, int new_failures, int new_assertions, const std::string &extra = "") const {
+        std::string line = indent() + label;
+
+        std::string details;
+        if (new_assertions > 0) {
+            if (new_failures == 0) {
+                details = std::to_string(new_assertions) + " assertion(s)";
+            } else {
+                details = std::to_string(new_failures) + " of " +
+                          std::to_string(new_assertions) + " assertion(s) failed";
+            }
+        }
+        if (!extra.empty()) {
+            if (!details.empty()) {
+                details += ", ";
+            }
+            details += extra;
+        }
+
+        if (!details.empty()) {
+            line += " (" + details + ")";
+        }
+
+        std::string status = (new_failures == 0) ? "[PASS]" : "[FAIL]";
+
+        if (line.size() + 1 < status_column) {
+            line.append(status_column - line.size(), ' ');
+        } else {
+            line.push_back(' ');
+        }
+
+        out << line << status << "\n";
+    }
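+
+    // The test()/bench() runners below share one pattern: push the name on
+    // the stack (so nested tests indent and are filtered by their full dotted
+    // name), snapshot the assertion/failure counters, run the callable via
+    // run_with_exceptions(), then report the deltas through print_result().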
"[PASS]" : "[FAIL]"; + + if (line.size() + 1 < status_column) { + line.append(status_column - line.size(), ' '); + } else { + line.push_back(' '); + } + + out << line << status << "\n"; + } + + template + void test(const std::string &name, F f) { + stack.push_back(name); + if (!should_run()) { + stack.pop_back(); + return; + } + + ++tests; + out << indent() << name << "\n"; + + int before_failures = failures; + int before_assertions = assertions; + + run_with_exceptions([&] { f(*this); }, "test"); + + int new_failures = failures - before_failures; + int new_assertions = assertions - before_assertions; + + print_result(name, new_failures, new_assertions); + + stack.pop_back(); + } + + template + void test(F f) { + test("test #" + std::to_string(++unnamed), f); + } + + template + void bench(const std::string &name, F f, int iterations = 100) { + stack.push_back(name); + if (!should_run()) { + stack.pop_back(); + return; + } + + ++tests; + out << indent() << "[bench] " << name << "\n"; + + int before_failures = failures; + int before_assertions = assertions; + + using clock = std::chrono::high_resolution_clock; + + std::chrono::microseconds duration(0); + + run_with_exceptions([&] { + for (auto i = 0; i < iterations; i++) { + auto start = clock::now(); + f(); + duration += std::chrono::duration_cast(clock::now() - start); + } + }, "bench"); + + auto avg_elapsed = duration.count() / iterations; + auto avg_elapsed_s = std::chrono::duration_cast>(duration).count() / iterations; + auto rate = (avg_elapsed_s > 0.0) ? (1.0 / avg_elapsed_s) : 0.0; + + int new_failures = failures - before_failures; + int new_assertions = assertions - before_assertions; + + std::string extra = + "n=" + std::to_string(iterations) + + " avg=" + std::to_string(avg_elapsed) + "us" + + " rate=" + std::to_string(int(rate)) + "/s"; + + print_result("[bench] " + name, new_failures, new_assertions, extra); + + stack.pop_back(); + } + + template + void bench(F f, int iterations = 100) { + bench("bench #" + std::to_string(++unnamed), f, iterations); + } + + // Assertions + bool assert_true(bool cond) { + return assert_true("", cond); + } + + bool assert_true(const std::string &msg, bool cond) { + ++assertions; + if (!cond) { + ++failures; + out << indent() << "ASSERTION FAILED"; + if (!msg.empty()) { + out << " : " << msg; + } + out << "\n"; + return false; + } + return true; + } + + template + bool assert_equal(const A &expected, const B &actual) { + return assert_equal("", expected, actual); + } + + template + bool assert_equal(const std::string &msg, const A &expected, const B &actual) { + ++assertions; + if (!(actual == expected)) { + ++failures; + out << indent() << "ASSERT EQUAL FAILED"; + if (!msg.empty()) { + out << " : " << msg; + } + out << "\n"; + + out << indent() << " expected: " << expected << "\n"; + out << indent() << " actual : " << actual << "\n"; + return false; + } + return true; + } + + // Print summary and return an exit code + int summary() const { + out << "\n"; + out << "tests : " << tests << "\n"; + out << "assertions : " << assertions << "\n"; + out << "failures : " << failures << "\n"; + out << "exceptions : " << exceptions << "\n"; + return failures == 0 ? 0 : 1; + } +}; -- cgit v1.2.3