From b333b06772c89d96aacb5490d6a219fba7c09cc6 Mon Sep 17 00:00:00 2001
From: Mitja Felicijan
Date: Thu, 12 Feb 2026 20:57:17 +0100
Subject: Engage!

---
 llama.cpp/tools/CMakeLists.txt | 40 +
 llama.cpp/tools/batched-bench/CMakeLists.txt | 8 +
 llama.cpp/tools/batched-bench/README.md | 60 +
 llama.cpp/tools/batched-bench/batched-bench.cpp | 256 +
 llama.cpp/tools/cli/CMakeLists.txt | 10 +
 llama.cpp/tools/cli/README.md | 192 +
 llama.cpp/tools/cli/cli.cpp | 421 +
 llama.cpp/tools/completion/CMakeLists.txt | 8 +
 llama.cpp/tools/completion/README.md | 578 ++
 llama.cpp/tools/completion/completion.cpp | 1001 +++
 llama.cpp/tools/cvector-generator/CMakeLists.txt | 8 +
 llama.cpp/tools/cvector-generator/README.md | 45 +
 llama.cpp/tools/cvector-generator/completions.txt | 582 ++
 .../tools/cvector-generator/cvector-generator.cpp | 508 ++
 llama.cpp/tools/cvector-generator/mean.hpp | 48 +
 llama.cpp/tools/cvector-generator/negative.txt | 4 +
 llama.cpp/tools/cvector-generator/pca.hpp | 315 +
 llama.cpp/tools/cvector-generator/positive.txt | 4 +
 llama.cpp/tools/export-lora/CMakeLists.txt | 8 +
 llama.cpp/tools/export-lora/README.md | 33 +
 llama.cpp/tools/export-lora/export-lora.cpp | 434 +
 llama.cpp/tools/fit-params/CMakeLists.txt | 8 +
 llama.cpp/tools/fit-params/README.md | 55 +
 llama.cpp/tools/fit-params/fit-params.cpp | 66 +
 llama.cpp/tools/gguf-split/CMakeLists.txt | 8 +
 llama.cpp/tools/gguf-split/README.md | 10 +
 llama.cpp/tools/gguf-split/gguf-split.cpp | 583 ++
 llama.cpp/tools/gguf-split/tests.sh | 89 +
 llama.cpp/tools/imatrix/CMakeLists.txt | 13 +
 llama.cpp/tools/imatrix/README.md | 98 +
 llama.cpp/tools/imatrix/imatrix.cpp | 1302 +++
 llama.cpp/tools/llama-bench/CMakeLists.txt | 8 +
 llama.cpp/tools/llama-bench/README.md | 349 +
 llama.cpp/tools/llama-bench/llama-bench.cpp | 2291 +++++
 llama.cpp/tools/mtmd/CMakeLists.txt | 96 +
 llama.cpp/tools/mtmd/README.md | 63 +
 llama.cpp/tools/mtmd/clip-graph.h | 117 +
 llama.cpp/tools/mtmd/clip-impl.h | 582 ++
 llama.cpp/tools/mtmd/clip-model.h | 389 +
 llama.cpp/tools/mtmd/clip.cpp | 4080 +++++++++
 llama.cpp/tools/mtmd/clip.h | 121 +
 llama.cpp/tools/mtmd/deprecation-warning.cpp | 22 +
 .../legacy-models/convert_image_encoder_to_gguf.py | 412 +
 .../glmedge-convert-image-encoder-to-gguf.py | 280 +
 .../tools/mtmd/legacy-models/glmedge-surgery.py | 33 +
 .../tools/mtmd/legacy-models/llava_surgery.py | 38 +
 .../tools/mtmd/legacy-models/llava_surgery_v2.py | 180 +
 .../minicpmv-convert-image-encoder-to-gguf.py | 892 ++
 .../tools/mtmd/legacy-models/minicpmv-surgery.py | 47 +
 llama.cpp/tools/mtmd/models/cogvlm.cpp | 98 +
 llama.cpp/tools/mtmd/models/conformer.cpp | 216 +
 llama.cpp/tools/mtmd/models/glm4v.cpp | 120 +
 llama.cpp/tools/mtmd/models/internvl.cpp | 69 +
 llama.cpp/tools/mtmd/models/kimik25.cpp | 101 +
 llama.cpp/tools/mtmd/models/kimivl.cpp | 63 +
 llama.cpp/tools/mtmd/models/llama4.cpp | 96 +
 llama.cpp/tools/mtmd/models/llava.cpp | 374 +
 llama.cpp/tools/mtmd/models/minicpmv.cpp | 114 +
 llama.cpp/tools/mtmd/models/mobilenetv5.cpp | 451 +
 llama.cpp/tools/mtmd/models/models.h | 118 +
 llama.cpp/tools/mtmd/models/pixtral.cpp | 86 +
 llama.cpp/tools/mtmd/models/qwen2vl.cpp | 183 +
 llama.cpp/tools/mtmd/models/qwen3vl.cpp | 193 +
 llama.cpp/tools/mtmd/models/siglip.cpp | 86 +
 llama.cpp/tools/mtmd/models/whisper-enc.cpp | 115 +
 llama.cpp/tools/mtmd/models/youtuvl.cpp | 179 +
 llama.cpp/tools/mtmd/mtmd-audio.cpp | 730 ++
 llama.cpp/tools/mtmd/mtmd-audio.h | 113 +
 llama.cpp/tools/mtmd/mtmd-cli.cpp | 437 +
 llama.cpp/tools/mtmd/mtmd-helper.cpp | 521 ++
 llama.cpp/tools/mtmd/mtmd-helper.h | 96 +
llama.cpp/tools/mtmd/mtmd.cpp | 1151 +++ llama.cpp/tools/mtmd/mtmd.h | 319 + llama.cpp/tools/mtmd/requirements.txt | 5 + llama.cpp/tools/mtmd/test-1.jpeg | Bin 0 -> 124071 bytes llama.cpp/tools/mtmd/test-2.mp3 | Bin 0 -> 140060 bytes llama.cpp/tools/mtmd/tests.sh | 183 + llama.cpp/tools/perplexity/CMakeLists.txt | 8 + llama.cpp/tools/perplexity/README.md | 193 + llama.cpp/tools/perplexity/perplexity.cpp | 2070 +++++ llama.cpp/tools/quantize/CMakeLists.txt | 9 + llama.cpp/tools/quantize/README.md | 171 + llama.cpp/tools/quantize/quantize.cpp | 733 ++ llama.cpp/tools/quantize/tests.sh | 65 + llama.cpp/tools/rpc/CMakeLists.txt | 8 + llama.cpp/tools/rpc/README.md | 104 + llama.cpp/tools/rpc/rpc-server.cpp | 336 + llama.cpp/tools/server/CMakeLists.txt | 70 + llama.cpp/tools/server/README-dev.md | 179 + llama.cpp/tools/server/README.md | 1782 ++++ llama.cpp/tools/server/bench/README.md | 119 + llama.cpp/tools/server/bench/bench.py | 322 + llama.cpp/tools/server/bench/prometheus.yml | 9 + llama.cpp/tools/server/bench/requirements.txt | 2 + llama.cpp/tools/server/bench/script.js | 162 + llama.cpp/tools/server/chat-llama2.sh | 109 + llama.cpp/tools/server/chat.mjs | 131 + llama.cpp/tools/server/chat.sh | 80 + llama.cpp/tools/server/public/index.html.gz | Bin 0 -> 1453103 bytes llama.cpp/tools/server/public/loading.html | 12 + .../tools/server/public_legacy/colorthemes.css | 402 + llama.cpp/tools/server/public_legacy/completion.js | 209 + llama.cpp/tools/server/public_legacy/favicon.ico | Bin 0 -> 4122 bytes .../tools/server/public_legacy/index-new.html | 1190 +++ llama.cpp/tools/server/public_legacy/index.html | 1301 +++ llama.cpp/tools/server/public_legacy/index.js | 1 + .../public_legacy/json-schema-to-grammar.mjs | 856 ++ llama.cpp/tools/server/public_legacy/loading.html | 12 + .../tools/server/public_legacy/prompt-formats.js | 331 + llama.cpp/tools/server/public_legacy/style.css | 954 ++ .../tools/server/public_legacy/system-prompts.js | 68 + .../server/public_legacy/theme-beeninorder.css | 228 + .../tools/server/public_legacy/theme-ketivah.css | 201 + .../server/public_legacy/theme-mangotango.css | 216 + .../server/public_legacy/theme-playground.css | 221 + .../server/public_legacy/theme-polarnight.css | 253 + .../tools/server/public_legacy/theme-snowstorm.css | 251 + .../tools/server/public_simplechat/datautils.mjs | 266 + .../tools/server/public_simplechat/index.html | 51 + llama.cpp/tools/server/public_simplechat/readme.md | 286 + .../tools/server/public_simplechat/simplechat.css | 79 + .../tools/server/public_simplechat/simplechat.js | 929 ++ .../public_simplechat/simplechat_screens.webp | Bin 0 -> 21376 bytes llama.cpp/tools/server/public_simplechat/ui.mjs | 211 + llama.cpp/tools/server/server-common.cpp | 1980 +++++ llama.cpp/tools/server/server-common.h | 366 + llama.cpp/tools/server/server-context.cpp | 4105 +++++++++ llama.cpp/tools/server/server-context.h | 131 + llama.cpp/tools/server/server-http.cpp | 406 + llama.cpp/tools/server/server-http.h | 78 + llama.cpp/tools/server/server-models.cpp | 1092 +++ llama.cpp/tools/server/server-models.h | 203 + llama.cpp/tools/server/server-queue.cpp | 450 + llama.cpp/tools/server/server-queue.h | 197 + llama.cpp/tools/server/server-task.cpp | 2005 +++++ llama.cpp/tools/server/server-task.h | 620 ++ llama.cpp/tools/server/server.cpp | 322 + llama.cpp/tools/server/tests/.gitignore | 2 + llama.cpp/tools/server/tests/README.md | 96 + llama.cpp/tools/server/tests/conftest.py | 21 + llama.cpp/tools/server/tests/pytest.ini | 4 + 
llama.cpp/tools/server/tests/requirements.txt | 8 + llama.cpp/tools/server/tests/tests.sh | 23 + llama.cpp/tools/server/tests/unit/test_basic.py | 96 + .../server/tests/unit/test_chat_completion.py | 512 ++ .../server/tests/unit/test_compat_anthropic.py | 896 ++ .../server/tests/unit/test_compat_oai_responses.py | 73 + .../tools/server/tests/unit/test_completion.py | 608 ++ .../tools/server/tests/unit/test_ctx_shift.py | 89 + .../tools/server/tests/unit/test_embedding.py | 257 + llama.cpp/tools/server/tests/unit/test_infill.py | 77 + llama.cpp/tools/server/tests/unit/test_lora.py | 115 + llama.cpp/tools/server/tests/unit/test_rerank.py | 146 + llama.cpp/tools/server/tests/unit/test_router.py | 194 + llama.cpp/tools/server/tests/unit/test_security.py | 127 + llama.cpp/tools/server/tests/unit/test_sleep.py | 39 + .../tools/server/tests/unit/test_slot_save.py | 98 + .../tools/server/tests/unit/test_speculative.py | 131 + llama.cpp/tools/server/tests/unit/test_template.py | 105 + llama.cpp/tools/server/tests/unit/test_tokenize.py | 59 + .../tools/server/tests/unit/test_tool_call.py | 625 ++ .../tools/server/tests/unit/test_vision_api.py | 160 + llama.cpp/tools/server/tests/utils.py | 643 ++ llama.cpp/tools/server/themes/README.md | 5 + .../tools/server/themes/buttons-top/README.md | 7 + .../server/themes/buttons-top/buttons_top.png | Bin 0 -> 119747 bytes .../tools/server/themes/buttons-top/favicon.ico | Bin 0 -> 4122 bytes .../tools/server/themes/buttons-top/index.html | 1052 +++ llama.cpp/tools/server/themes/wild/README.md | 5 + llama.cpp/tools/server/themes/wild/favicon.ico | Bin 0 -> 4122 bytes llama.cpp/tools/server/themes/wild/index.html | 1056 +++ llama.cpp/tools/server/themes/wild/llama_cpp.png | Bin 0 -> 76484 bytes .../tools/server/themes/wild/llamapattern.png | Bin 0 -> 259586 bytes llama.cpp/tools/server/themes/wild/wild.png | Bin 0 -> 496463 bytes llama.cpp/tools/server/webui/.gitignore | 28 + llama.cpp/tools/server/webui/.npmrc | 1 + llama.cpp/tools/server/webui/.prettierignore | 9 + llama.cpp/tools/server/webui/.prettierrc | 16 + .../webui/.storybook/ModeWatcherDecorator.svelte | 36 + .../.storybook/TooltipProviderDecorator.svelte | 13 + llama.cpp/tools/server/webui/.storybook/main.ts | 17 + llama.cpp/tools/server/webui/.storybook/preview.ts | 42 + .../tools/server/webui/.storybook/vitest.setup.ts | 12 + llama.cpp/tools/server/webui/README.md | 687 ++ llama.cpp/tools/server/webui/components.json | 16 + .../high-level-architecture-simplified.md | 106 + .../docs/architecture/high-level-architecture.md | 279 + .../tools/server/webui/docs/flows/chat-flow.md | 174 + .../server/webui/docs/flows/conversations-flow.md | 155 + .../docs/flows/data-flow-simplified-model-mode.md | 45 + .../docs/flows/data-flow-simplified-router-mode.md | 77 + .../tools/server/webui/docs/flows/database-flow.md | 155 + .../tools/server/webui/docs/flows/models-flow.md | 181 + .../tools/server/webui/docs/flows/server-flow.md | 76 + .../tools/server/webui/docs/flows/settings-flow.md | 144 + llama.cpp/tools/server/webui/eslint.config.js | 49 + llama.cpp/tools/server/webui/package-lock.json | 9343 ++++++++++++++++++++ llama.cpp/tools/server/webui/package.json | 94 + llama.cpp/tools/server/webui/playwright.config.ts | 11 + llama.cpp/tools/server/webui/scripts/dev.sh | 57 + .../server/webui/scripts/install-git-hooks.sh | 202 + llama.cpp/tools/server/webui/scripts/post-build.sh | 3 + llama.cpp/tools/server/webui/src/app.css | 138 + llama.cpp/tools/server/webui/src/app.d.ts | 133 + 
llama.cpp/tools/server/webui/src/app.html | 12 + .../ChatAttachments/ChatAttachmentPreview.svelte | 283 + .../ChatAttachmentThumbnailFile.svelte | 165 + .../ChatAttachmentThumbnailImage.svelte | 64 + .../ChatAttachments/ChatAttachmentsList.svelte | 243 + .../ChatAttachments/ChatAttachmentsViewAll.svelte | 117 + .../components/app/chat/ChatForm/ChatForm.svelte | 315 + .../ChatFormActionFileAttachments.svelte | 123 + .../ChatFormActions/ChatFormActionRecord.svelte | 52 + .../ChatFormActions/ChatFormActionSubmit.svelte | 55 + .../ChatFormActions/ChatFormActions.svelte | 204 + .../ChatForm/ChatFormFileInputInvisible.svelte | 30 + .../app/chat/ChatForm/ChatFormHelperText.svelte | 17 + .../app/chat/ChatForm/ChatFormTextarea.svelte | 59 + .../app/chat/ChatMessages/ChatMessage.svelte | 286 + .../chat/ChatMessages/ChatMessageActions.svelte | 100 + .../chat/ChatMessages/ChatMessageAssistant.svelte | 418 + .../ChatMessageBranchingControls.svelte | 84 + .../chat/ChatMessages/ChatMessageEditForm.svelte | 391 + .../chat/ChatMessages/ChatMessageStatistics.svelte | 175 + .../app/chat/ChatMessages/ChatMessageSystem.svelte | 216 + .../ChatMessages/ChatMessageThinkingBlock.svelte | 68 + .../app/chat/ChatMessages/ChatMessageUser.svelte | 163 + .../app/chat/ChatMessages/ChatMessages.svelte | 143 + .../app/chat/ChatScreen/ChatScreen.svelte | 617 ++ .../chat/ChatScreen/ChatScreenDragOverlay.svelte | 17 + .../app/chat/ChatScreen/ChatScreenHeader.svelte | 28 + .../ChatScreen/ChatScreenProcessingInfo.svelte | 120 + .../app/chat/ChatSettings/ChatSettings.svelte | 508 ++ .../chat/ChatSettings/ChatSettingsFields.svelte | 255 + .../chat/ChatSettings/ChatSettingsFooter.svelte | 59 + .../ChatSettingsImportExportTab.svelte | 317 + .../ChatSettingsParameterSourceIndicator.svelte | 18 + .../app/chat/ChatSidebar/ChatSidebar.svelte | 211 + .../app/chat/ChatSidebar/ChatSidebarActions.svelte | 81 + .../ChatSidebar/ChatSidebarConversationItem.svelte | 200 + .../app/chat/ChatSidebar/ChatSidebarSearch.svelte | 19 + .../handle-mobile-sidebar-item-click.ts | 9 + .../app/dialogs/DialogChatAttachmentPreview.svelte | 67 + .../dialogs/DialogChatAttachmentsViewAll.svelte | 54 + .../components/app/dialogs/DialogChatError.svelte | 70 + .../app/dialogs/DialogChatSettings.svelte | 37 + .../app/dialogs/DialogConfirmation.svelte | 72 + .../app/dialogs/DialogConversationSelection.svelte | 68 + .../dialogs/DialogConversationTitleUpdate.svelte | 46 + .../app/dialogs/DialogEmptyFileAlert.svelte | 61 + .../app/dialogs/DialogModelInformation.svelte | 211 + .../app/dialogs/DialogModelNotAvailable.svelte | 76 + .../server/webui/src/lib/components/app/index.ts | 75 + .../lib/components/app/misc/ActionButton.svelte | 47 + .../lib/components/app/misc/ActionDropdown.svelte | 86 + .../components/app/misc/BadgeChatStatistic.svelte | 44 + .../src/lib/components/app/misc/BadgeInfo.svelte | 27 + .../lib/components/app/misc/BadgeModality.svelte | 39 + .../components/app/misc/CodePreviewDialog.svelte | 93 + .../app/misc/ConversationSelection.svelte | 205 + .../components/app/misc/CopyToClipboardIcon.svelte | 18 + .../app/misc/KeyboardShortcutInfo.svelte | 31 + .../lib/components/app/misc/MarkdownContent.svelte | 870 ++ .../lib/components/app/misc/RemoveButton.svelte | 26 + .../src/lib/components/app/misc/SearchInput.svelte | 73 + .../app/misc/SyntaxHighlightedCode.svelte | 97 + .../lib/components/app/models/ModelBadge.svelte | 56 + .../components/app/models/ModelsSelector.svelte | 555 ++ .../components/app/server/ServerErrorSplash.svelte | 282 + 
.../app/server/ServerLoadingSplash.svelte | 33 + .../lib/components/app/server/ServerStatus.svelte | 65 + .../ui/alert-dialog/alert-dialog-action.svelte | 18 + .../ui/alert-dialog/alert-dialog-cancel.svelte | 18 + .../ui/alert-dialog/alert-dialog-content.svelte | 35 + .../alert-dialog/alert-dialog-description.svelte | 17 + .../ui/alert-dialog/alert-dialog-footer.svelte | 23 + .../ui/alert-dialog/alert-dialog-header.svelte | 20 + .../ui/alert-dialog/alert-dialog-overlay.svelte | 20 + .../ui/alert-dialog/alert-dialog-title.svelte | 17 + .../ui/alert-dialog/alert-dialog-trigger.svelte | 7 + .../src/lib/components/ui/alert-dialog/index.ts | 39 + .../components/ui/alert/alert-description.svelte | 23 + .../src/lib/components/ui/alert/alert-title.svelte | 20 + .../webui/src/lib/components/ui/alert/alert.svelte | 44 + .../webui/src/lib/components/ui/alert/index.ts | 14 + .../webui/src/lib/components/ui/badge/badge.svelte | 49 + .../webui/src/lib/components/ui/badge/index.ts | 2 + .../src/lib/components/ui/button/button.svelte | 87 + .../webui/src/lib/components/ui/button/index.ts | 17 + .../src/lib/components/ui/card/card-action.svelte | 20 + .../src/lib/components/ui/card/card-content.svelte | 15 + .../lib/components/ui/card/card-description.svelte | 20 + .../src/lib/components/ui/card/card-footer.svelte | 20 + .../src/lib/components/ui/card/card-header.svelte | 23 + .../src/lib/components/ui/card/card-title.svelte | 20 + .../webui/src/lib/components/ui/card/card.svelte | 23 + .../webui/src/lib/components/ui/card/index.ts | 25 + .../src/lib/components/ui/checkbox/checkbox.svelte | 36 + .../webui/src/lib/components/ui/checkbox/index.ts | 6 + .../ui/collapsible/collapsible-content.svelte | 7 + .../ui/collapsible/collapsible-trigger.svelte | 7 + .../components/ui/collapsible/collapsible.svelte | 11 + .../src/lib/components/ui/collapsible/index.ts | 13 + .../lib/components/ui/dialog/dialog-close.svelte | 7 + .../lib/components/ui/dialog/dialog-content.svelte | 43 + .../components/ui/dialog/dialog-description.svelte | 17 + .../lib/components/ui/dialog/dialog-footer.svelte | 20 + .../lib/components/ui/dialog/dialog-header.svelte | 20 + .../lib/components/ui/dialog/dialog-overlay.svelte | 20 + .../lib/components/ui/dialog/dialog-title.svelte | 17 + .../lib/components/ui/dialog/dialog-trigger.svelte | 7 + .../webui/src/lib/components/ui/dialog/index.ts | 37 + .../dropdown-menu-checkbox-item.svelte | 41 + .../ui/dropdown-menu/dropdown-menu-content.svelte | 27 + .../dropdown-menu-group-heading.svelte | 22 + .../ui/dropdown-menu/dropdown-menu-group.svelte | 7 + .../ui/dropdown-menu/dropdown-menu-item.svelte | 27 + .../ui/dropdown-menu/dropdown-menu-label.svelte | 24 + .../dropdown-menu/dropdown-menu-radio-group.svelte | 16 + .../dropdown-menu/dropdown-menu-radio-item.svelte | 31 + .../dropdown-menu/dropdown-menu-separator.svelte | 17 + .../ui/dropdown-menu/dropdown-menu-shortcut.svelte | 20 + .../dropdown-menu/dropdown-menu-sub-content.svelte | 20 + .../dropdown-menu/dropdown-menu-sub-trigger.svelte | 29 + .../ui/dropdown-menu/dropdown-menu-trigger.svelte | 7 + .../src/lib/components/ui/dropdown-menu/index.ts | 49 + .../webui/src/lib/components/ui/input/index.ts | 7 + .../webui/src/lib/components/ui/input/input.svelte | 51 + .../webui/src/lib/components/ui/label/index.ts | 7 + .../webui/src/lib/components/ui/label/label.svelte | 20 + .../webui/src/lib/components/ui/popover/index.ts | 19 + .../lib/components/ui/popover/popover-close.svelte | 7 + .../components/ui/popover/popover-content.svelte | 37 + 
.../components/ui/popover/popover-portal.svelte | 7 + .../components/ui/popover/popover-trigger.svelte | 17 + .../src/lib/components/ui/popover/popover.svelte | 7 + .../src/lib/components/ui/scroll-area/index.ts | 10 + .../ui/scroll-area/scroll-area-scrollbar.svelte | 31 + .../components/ui/scroll-area/scroll-area.svelte | 40 + .../webui/src/lib/components/ui/select/index.ts | 37 + .../lib/components/ui/select/select-content.svelte | 111 + .../ui/select/select-group-heading.svelte | 21 + .../lib/components/ui/select/select-group.svelte | 7 + .../lib/components/ui/select/select-item.svelte | 38 + .../lib/components/ui/select/select-label.svelte | 20 + .../ui/select/select-scroll-down-button.svelte | 20 + .../ui/select/select-scroll-up-button.svelte | 20 + .../components/ui/select/select-separator.svelte | 18 + .../lib/components/ui/select/select-trigger.svelte | 40 + .../webui/src/lib/components/ui/separator/index.ts | 7 + .../lib/components/ui/separator/separator.svelte | 20 + .../webui/src/lib/components/ui/sheet/index.ts | 36 + .../src/lib/components/ui/sheet/sheet-close.svelte | 7 + .../lib/components/ui/sheet/sheet-content.svelte | 60 + .../components/ui/sheet/sheet-description.svelte | 17 + .../lib/components/ui/sheet/sheet-footer.svelte | 20 + .../lib/components/ui/sheet/sheet-header.svelte | 20 + .../lib/components/ui/sheet/sheet-overlay.svelte | 20 + .../src/lib/components/ui/sheet/sheet-title.svelte | 17 + .../lib/components/ui/sheet/sheet-trigger.svelte | 7 + .../src/lib/components/ui/sidebar/constants.ts | 6 + .../lib/components/ui/sidebar/context.svelte.ts | 79 + .../webui/src/lib/components/ui/sidebar/index.ts | 75 + .../components/ui/sidebar/sidebar-content.svelte | 24 + .../components/ui/sidebar/sidebar-footer.svelte | 21 + .../ui/sidebar/sidebar-group-action.svelte | 36 + .../ui/sidebar/sidebar-group-content.svelte | 21 + .../ui/sidebar/sidebar-group-label.svelte | 34 + .../lib/components/ui/sidebar/sidebar-group.svelte | 21 + .../components/ui/sidebar/sidebar-header.svelte | 21 + .../lib/components/ui/sidebar/sidebar-input.svelte | 21 + .../lib/components/ui/sidebar/sidebar-inset.svelte | 24 + .../ui/sidebar/sidebar-menu-action.svelte | 43 + .../ui/sidebar/sidebar-menu-badge.svelte | 29 + .../ui/sidebar/sidebar-menu-button.svelte | 106 + .../components/ui/sidebar/sidebar-menu-item.svelte | 21 + .../ui/sidebar/sidebar-menu-skeleton.svelte | 36 + .../ui/sidebar/sidebar-menu-sub-button.svelte | 43 + .../ui/sidebar/sidebar-menu-sub-item.svelte | 21 + .../components/ui/sidebar/sidebar-menu-sub.svelte | 25 + .../lib/components/ui/sidebar/sidebar-menu.svelte | 21 + .../components/ui/sidebar/sidebar-provider.svelte | 50 + .../lib/components/ui/sidebar/sidebar-rail.svelte | 36 + .../components/ui/sidebar/sidebar-separator.svelte | 19 + .../components/ui/sidebar/sidebar-trigger.svelte | 35 + .../src/lib/components/ui/sidebar/sidebar.svelte | 101 + .../webui/src/lib/components/ui/skeleton/index.ts | 7 + .../src/lib/components/ui/skeleton/skeleton.svelte | 17 + .../webui/src/lib/components/ui/switch/index.ts | 7 + .../src/lib/components/ui/switch/switch.svelte | 29 + .../webui/src/lib/components/ui/table/index.ts | 28 + .../src/lib/components/ui/table/table-body.svelte | 20 + .../lib/components/ui/table/table-caption.svelte | 20 + .../src/lib/components/ui/table/table-cell.svelte | 23 + .../lib/components/ui/table/table-footer.svelte | 20 + .../src/lib/components/ui/table/table-head.svelte | 23 + .../lib/components/ui/table/table-header.svelte | 20 + 
.../src/lib/components/ui/table/table-row.svelte | 23 + .../webui/src/lib/components/ui/table/table.svelte | 22 + .../webui/src/lib/components/ui/textarea/index.ts | 7 + .../src/lib/components/ui/textarea/textarea.svelte | 22 + .../webui/src/lib/components/ui/tooltip/index.ts | 21 + .../components/ui/tooltip/tooltip-content.svelte | 47 + .../components/ui/tooltip/tooltip-trigger.svelte | 7 + .../server/webui/src/lib/components/ui/utils.ts | 13 + .../server/webui/src/lib/constants/auto-scroll.ts | 3 + .../webui/src/lib/constants/binary-detection.ts | 14 + .../webui/src/lib/constants/default-context.ts | 1 + .../src/lib/constants/floating-ui-constraints.ts | 2 + .../tools/server/webui/src/lib/constants/icons.ts | 32 + .../webui/src/lib/constants/input-classes.ts | 6 + .../webui/src/lib/constants/latex-protection.ts | 35 + .../server/webui/src/lib/constants/literal-html.ts | 15 + .../webui/src/lib/constants/localstorage-keys.ts | 2 + .../webui/src/lib/constants/max-bundle-size.ts | 1 + .../server/webui/src/lib/constants/precision.ts | 2 + .../webui/src/lib/constants/processing-info.ts | 1 + .../webui/src/lib/constants/settings-config.ts | 117 + .../src/lib/constants/supported-file-types.ts | 217 + .../webui/src/lib/constants/table-html-restorer.ts | 20 + .../webui/src/lib/constants/tooltip-config.ts | 1 + .../server/webui/src/lib/constants/viewport.ts | 1 + .../tools/server/webui/src/lib/enums/attachment.ts | 10 + llama.cpp/tools/server/webui/src/lib/enums/chat.ts | 4 + .../tools/server/webui/src/lib/enums/files.ts | 206 + .../tools/server/webui/src/lib/enums/index.ts | 23 + .../tools/server/webui/src/lib/enums/model.ts | 5 + .../tools/server/webui/src/lib/enums/server.ts | 20 + .../server/webui/src/lib/hooks/is-mobile.svelte.ts | 8 + .../hooks/use-model-change-validation.svelte.ts | 118 + .../src/lib/hooks/use-processing-state.svelte.ts | 262 + .../webui/src/lib/markdown/enhance-code-blocks.ts | 162 + .../server/webui/src/lib/markdown/enhance-links.ts | 33 + .../server/webui/src/lib/markdown/literal-html.ts | 121 + .../webui/src/lib/markdown/table-html-restorer.ts | 181 + .../tools/server/webui/src/lib/services/chat.ts | 784 ++ .../server/webui/src/lib/services/database.ts | 400 + .../tools/server/webui/src/lib/services/index.ts | 5 + .../tools/server/webui/src/lib/services/models.ts | 124 + .../webui/src/lib/services/parameter-sync.spec.ts | 148 + .../webui/src/lib/services/parameter-sync.ts | 279 + .../tools/server/webui/src/lib/services/props.ts | 77 + .../server/webui/src/lib/stores/chat.svelte.ts | 1487 ++++ .../webui/src/lib/stores/conversations.svelte.ts | 662 ++ .../server/webui/src/lib/stores/models.svelte.ts | 605 ++ .../webui/src/lib/stores/persisted.svelte.ts | 50 + .../server/webui/src/lib/stores/server.svelte.ts | 140 + .../server/webui/src/lib/stores/settings.svelte.ts | 421 + .../tools/server/webui/src/lib/types/api.d.ts | 430 + .../tools/server/webui/src/lib/types/chat.d.ts | 55 + .../tools/server/webui/src/lib/types/database.d.ts | 85 + .../tools/server/webui/src/lib/types/index.ts | 70 + .../tools/server/webui/src/lib/types/models.d.ts | 21 + .../tools/server/webui/src/lib/types/settings.d.ts | 67 + .../server/webui/src/lib/utils/api-headers.ts | 22 + .../webui/src/lib/utils/api-key-validation.ts | 45 + .../webui/src/lib/utils/attachment-display.ts | 61 + .../server/webui/src/lib/utils/attachment-type.ts | 105 + .../server/webui/src/lib/utils/audio-recording.ts | 226 + .../webui/src/lib/utils/autoresize-textarea.ts | 10 + .../tools/server/webui/src/lib/utils/branching.ts | 
283 + .../server/webui/src/lib/utils/browser-only.ts | 35 + .../tools/server/webui/src/lib/utils/clipboard.ts | 259 + .../server/webui/src/lib/utils/config-helpers.ts | 51 + .../webui/src/lib/utils/conversation-utils.ts | 30 + .../webui/src/lib/utils/convert-files-to-extra.ts | 192 + .../server/webui/src/lib/utils/file-preview.ts | 36 + .../tools/server/webui/src/lib/utils/file-type.ts | 222 + .../tools/server/webui/src/lib/utils/formatters.ts | 53 + .../tools/server/webui/src/lib/utils/index.ts | 95 + .../server/webui/src/lib/utils/is-ime-composing.ts | 5 + .../server/webui/src/lib/utils/latex-protection.ts | 270 + .../src/lib/utils/modality-file-validation.ts | 162 + .../server/webui/src/lib/utils/model-names.ts | 56 + .../server/webui/src/lib/utils/pdf-processing.ts | 150 + .../server/webui/src/lib/utils/portal-to-body.ts | 20 + .../tools/server/webui/src/lib/utils/precision.ts | 25 + .../webui/src/lib/utils/process-uploaded-files.ts | 136 + .../tools/server/webui/src/lib/utils/svg-to-png.ts | 71 + .../src/lib/utils/syntax-highlight-language.ts | 145 + .../tools/server/webui/src/lib/utils/text-files.ts | 97 + llama.cpp/tools/server/webui/src/lib/utils/text.ts | 7 + .../server/webui/src/lib/utils/webp-to-png.ts | 73 + .../tools/server/webui/src/routes/+error.svelte | 70 + .../tools/server/webui/src/routes/+layout.svelte | 223 + .../tools/server/webui/src/routes/+page.svelte | 91 + llama.cpp/tools/server/webui/src/routes/+page.ts | 6 + .../server/webui/src/routes/chat/[id]/+page.svelte | 176 + .../server/webui/src/routes/chat/[id]/+page.ts | 6 + .../server/webui/src/styles/katex-custom.scss | 13 + llama.cpp/tools/server/webui/static/favicon.svg | 1 + llama.cpp/tools/server/webui/static/loading.html | 12 + llama.cpp/tools/server/webui/svelte.config.js | 34 + .../tests/client/components/TestWrapper.svelte | 17 + .../server/webui/tests/client/page.svelte.test.ts | 11 + .../tools/server/webui/tests/e2e/demo.test.ts | 6 + .../webui/tests/stories/ChatForm.stories.svelte | 161 + .../webui/tests/stories/ChatMessage.stories.svelte | 207 + .../tests/stories/ChatSettings.stories.svelte | 19 + .../webui/tests/stories/ChatSidebar.stories.svelte | 97 + .../server/webui/tests/stories/Introduction.mdx | 44 + .../tests/stories/MarkdownContent.stories.svelte | 130 + .../webui/tests/stories/fixtures/ai-tutorial.ts | 164 + .../webui/tests/stories/fixtures/api-docs.ts | 160 + .../webui/tests/stories/fixtures/assets/1.jpg | Bin 0 -> 44891 bytes .../fixtures/assets/beautiful-flowers-lotus.webp | Bin 0 -> 817630 bytes .../tests/stories/fixtures/assets/example.pdf | Bin 0 -> 351048 bytes .../tests/stories/fixtures/assets/hf-logo.svg | 8 + .../webui/tests/stories/fixtures/blog-post.ts | 125 + .../webui/tests/stories/fixtures/data-analysis.ts | 124 + .../server/webui/tests/stories/fixtures/empty.ts | 2 + .../webui/tests/stories/fixtures/math-formulas.ts | 221 + .../server/webui/tests/stories/fixtures/readme.ts | 136 + .../tests/stories/fixtures/storybook-mocks.ts | 81 + .../server/webui/tests/unit/clipboard.test.ts | 423 + .../webui/tests/unit/latex-protection.test.ts | 376 + .../server/webui/tests/unit/model-names.test.ts | 51 + llama.cpp/tools/server/webui/tsconfig.json | 19 + llama.cpp/tools/server/webui/vite.config.ts | 166 + .../tools/server/webui/vitest-setup-client.ts | 2 + llama.cpp/tools/tokenize/CMakeLists.txt | 7 + llama.cpp/tools/tokenize/tokenize.cpp | 416 + llama.cpp/tools/tts/CMakeLists.txt | 8 + llama.cpp/tools/tts/README.md | 117 + llama.cpp/tools/tts/convert_pt_to_hf.py | 180 + 
llama.cpp/tools/tts/tts-outetts.py | 299 + llama.cpp/tools/tts/tts.cpp | 1093 +++ 527 files changed, 98897 insertions(+) create mode 100644 llama.cpp/tools/CMakeLists.txt create mode 100644 llama.cpp/tools/batched-bench/CMakeLists.txt create mode 100644 llama.cpp/tools/batched-bench/README.md create mode 100644 llama.cpp/tools/batched-bench/batched-bench.cpp create mode 100644 llama.cpp/tools/cli/CMakeLists.txt create mode 100644 llama.cpp/tools/cli/README.md create mode 100644 llama.cpp/tools/cli/cli.cpp create mode 100644 llama.cpp/tools/completion/CMakeLists.txt create mode 100644 llama.cpp/tools/completion/README.md create mode 100644 llama.cpp/tools/completion/completion.cpp create mode 100644 llama.cpp/tools/cvector-generator/CMakeLists.txt create mode 100644 llama.cpp/tools/cvector-generator/README.md create mode 100644 llama.cpp/tools/cvector-generator/completions.txt create mode 100644 llama.cpp/tools/cvector-generator/cvector-generator.cpp create mode 100644 llama.cpp/tools/cvector-generator/mean.hpp create mode 100644 llama.cpp/tools/cvector-generator/negative.txt create mode 100644 llama.cpp/tools/cvector-generator/pca.hpp create mode 100644 llama.cpp/tools/cvector-generator/positive.txt create mode 100644 llama.cpp/tools/export-lora/CMakeLists.txt create mode 100644 llama.cpp/tools/export-lora/README.md create mode 100644 llama.cpp/tools/export-lora/export-lora.cpp create mode 100644 llama.cpp/tools/fit-params/CMakeLists.txt create mode 100644 llama.cpp/tools/fit-params/README.md create mode 100644 llama.cpp/tools/fit-params/fit-params.cpp create mode 100644 llama.cpp/tools/gguf-split/CMakeLists.txt create mode 100644 llama.cpp/tools/gguf-split/README.md create mode 100644 llama.cpp/tools/gguf-split/gguf-split.cpp create mode 100755 llama.cpp/tools/gguf-split/tests.sh create mode 100644 llama.cpp/tools/imatrix/CMakeLists.txt create mode 100644 llama.cpp/tools/imatrix/README.md create mode 100644 llama.cpp/tools/imatrix/imatrix.cpp create mode 100644 llama.cpp/tools/llama-bench/CMakeLists.txt create mode 100644 llama.cpp/tools/llama-bench/README.md create mode 100644 llama.cpp/tools/llama-bench/llama-bench.cpp create mode 100644 llama.cpp/tools/mtmd/CMakeLists.txt create mode 100644 llama.cpp/tools/mtmd/README.md create mode 100644 llama.cpp/tools/mtmd/clip-graph.h create mode 100644 llama.cpp/tools/mtmd/clip-impl.h create mode 100644 llama.cpp/tools/mtmd/clip-model.h create mode 100644 llama.cpp/tools/mtmd/clip.cpp create mode 100644 llama.cpp/tools/mtmd/clip.h create mode 100644 llama.cpp/tools/mtmd/deprecation-warning.cpp create mode 100644 llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/llava_surgery.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py create mode 100644 llama.cpp/tools/mtmd/models/cogvlm.cpp create mode 100644 llama.cpp/tools/mtmd/models/conformer.cpp create mode 100644 llama.cpp/tools/mtmd/models/glm4v.cpp create mode 100644 llama.cpp/tools/mtmd/models/internvl.cpp create mode 100644 llama.cpp/tools/mtmd/models/kimik25.cpp create mode 100644 llama.cpp/tools/mtmd/models/kimivl.cpp create mode 100644 
llama.cpp/tools/mtmd/models/llama4.cpp create mode 100644 llama.cpp/tools/mtmd/models/llava.cpp create mode 100644 llama.cpp/tools/mtmd/models/minicpmv.cpp create mode 100644 llama.cpp/tools/mtmd/models/mobilenetv5.cpp create mode 100644 llama.cpp/tools/mtmd/models/models.h create mode 100644 llama.cpp/tools/mtmd/models/pixtral.cpp create mode 100644 llama.cpp/tools/mtmd/models/qwen2vl.cpp create mode 100644 llama.cpp/tools/mtmd/models/qwen3vl.cpp create mode 100644 llama.cpp/tools/mtmd/models/siglip.cpp create mode 100644 llama.cpp/tools/mtmd/models/whisper-enc.cpp create mode 100644 llama.cpp/tools/mtmd/models/youtuvl.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-audio.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-audio.h create mode 100644 llama.cpp/tools/mtmd/mtmd-cli.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-helper.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-helper.h create mode 100644 llama.cpp/tools/mtmd/mtmd.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd.h create mode 100644 llama.cpp/tools/mtmd/requirements.txt create mode 100644 llama.cpp/tools/mtmd/test-1.jpeg create mode 100644 llama.cpp/tools/mtmd/test-2.mp3 create mode 100755 llama.cpp/tools/mtmd/tests.sh create mode 100644 llama.cpp/tools/perplexity/CMakeLists.txt create mode 100644 llama.cpp/tools/perplexity/README.md create mode 100644 llama.cpp/tools/perplexity/perplexity.cpp create mode 100644 llama.cpp/tools/quantize/CMakeLists.txt create mode 100644 llama.cpp/tools/quantize/README.md create mode 100644 llama.cpp/tools/quantize/quantize.cpp create mode 100644 llama.cpp/tools/quantize/tests.sh create mode 100644 llama.cpp/tools/rpc/CMakeLists.txt create mode 100644 llama.cpp/tools/rpc/README.md create mode 100644 llama.cpp/tools/rpc/rpc-server.cpp create mode 100644 llama.cpp/tools/server/CMakeLists.txt create mode 100644 llama.cpp/tools/server/README-dev.md create mode 100644 llama.cpp/tools/server/README.md create mode 100644 llama.cpp/tools/server/bench/README.md create mode 100644 llama.cpp/tools/server/bench/bench.py create mode 100644 llama.cpp/tools/server/bench/prometheus.yml create mode 100644 llama.cpp/tools/server/bench/requirements.txt create mode 100644 llama.cpp/tools/server/bench/script.js create mode 100755 llama.cpp/tools/server/chat-llama2.sh create mode 100644 llama.cpp/tools/server/chat.mjs create mode 100755 llama.cpp/tools/server/chat.sh create mode 100644 llama.cpp/tools/server/public/index.html.gz create mode 100644 llama.cpp/tools/server/public/loading.html create mode 100755 llama.cpp/tools/server/public_legacy/colorthemes.css create mode 100644 llama.cpp/tools/server/public_legacy/completion.js create mode 100644 llama.cpp/tools/server/public_legacy/favicon.ico create mode 100644 llama.cpp/tools/server/public_legacy/index-new.html create mode 100644 llama.cpp/tools/server/public_legacy/index.html create mode 100644 llama.cpp/tools/server/public_legacy/index.js create mode 100644 llama.cpp/tools/server/public_legacy/json-schema-to-grammar.mjs create mode 100644 llama.cpp/tools/server/public_legacy/loading.html create mode 100644 llama.cpp/tools/server/public_legacy/prompt-formats.js create mode 100644 llama.cpp/tools/server/public_legacy/style.css create mode 100644 llama.cpp/tools/server/public_legacy/system-prompts.js create mode 100755 llama.cpp/tools/server/public_legacy/theme-beeninorder.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-ketivah.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-mangotango.css create mode 100755 
llama.cpp/tools/server/public_legacy/theme-playground.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-polarnight.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-snowstorm.css create mode 100644 llama.cpp/tools/server/public_simplechat/datautils.mjs create mode 100644 llama.cpp/tools/server/public_simplechat/index.html create mode 100644 llama.cpp/tools/server/public_simplechat/readme.md create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat.css create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat.js create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat_screens.webp create mode 100644 llama.cpp/tools/server/public_simplechat/ui.mjs create mode 100644 llama.cpp/tools/server/server-common.cpp create mode 100644 llama.cpp/tools/server/server-common.h create mode 100644 llama.cpp/tools/server/server-context.cpp create mode 100644 llama.cpp/tools/server/server-context.h create mode 100644 llama.cpp/tools/server/server-http.cpp create mode 100644 llama.cpp/tools/server/server-http.h create mode 100644 llama.cpp/tools/server/server-models.cpp create mode 100644 llama.cpp/tools/server/server-models.h create mode 100644 llama.cpp/tools/server/server-queue.cpp create mode 100644 llama.cpp/tools/server/server-queue.h create mode 100644 llama.cpp/tools/server/server-task.cpp create mode 100644 llama.cpp/tools/server/server-task.h create mode 100644 llama.cpp/tools/server/server.cpp create mode 100644 llama.cpp/tools/server/tests/.gitignore create mode 100644 llama.cpp/tools/server/tests/README.md create mode 100644 llama.cpp/tools/server/tests/conftest.py create mode 100644 llama.cpp/tools/server/tests/pytest.ini create mode 100644 llama.cpp/tools/server/tests/requirements.txt create mode 100755 llama.cpp/tools/server/tests/tests.sh create mode 100644 llama.cpp/tools/server/tests/unit/test_basic.py create mode 100644 llama.cpp/tools/server/tests/unit/test_chat_completion.py create mode 100644 llama.cpp/tools/server/tests/unit/test_compat_anthropic.py create mode 100644 llama.cpp/tools/server/tests/unit/test_compat_oai_responses.py create mode 100644 llama.cpp/tools/server/tests/unit/test_completion.py create mode 100644 llama.cpp/tools/server/tests/unit/test_ctx_shift.py create mode 100644 llama.cpp/tools/server/tests/unit/test_embedding.py create mode 100644 llama.cpp/tools/server/tests/unit/test_infill.py create mode 100644 llama.cpp/tools/server/tests/unit/test_lora.py create mode 100644 llama.cpp/tools/server/tests/unit/test_rerank.py create mode 100644 llama.cpp/tools/server/tests/unit/test_router.py create mode 100644 llama.cpp/tools/server/tests/unit/test_security.py create mode 100644 llama.cpp/tools/server/tests/unit/test_sleep.py create mode 100644 llama.cpp/tools/server/tests/unit/test_slot_save.py create mode 100644 llama.cpp/tools/server/tests/unit/test_speculative.py create mode 100644 llama.cpp/tools/server/tests/unit/test_template.py create mode 100644 llama.cpp/tools/server/tests/unit/test_tokenize.py create mode 100755 llama.cpp/tools/server/tests/unit/test_tool_call.py create mode 100644 llama.cpp/tools/server/tests/unit/test_vision_api.py create mode 100644 llama.cpp/tools/server/tests/utils.py create mode 100644 llama.cpp/tools/server/themes/README.md create mode 100644 llama.cpp/tools/server/themes/buttons-top/README.md create mode 100644 llama.cpp/tools/server/themes/buttons-top/buttons_top.png create mode 100644 llama.cpp/tools/server/themes/buttons-top/favicon.ico create mode 100644 
llama.cpp/tools/server/themes/buttons-top/index.html create mode 100644 llama.cpp/tools/server/themes/wild/README.md create mode 100644 llama.cpp/tools/server/themes/wild/favicon.ico create mode 100644 llama.cpp/tools/server/themes/wild/index.html create mode 100644 llama.cpp/tools/server/themes/wild/llama_cpp.png create mode 100644 llama.cpp/tools/server/themes/wild/llamapattern.png create mode 100644 llama.cpp/tools/server/themes/wild/wild.png create mode 100644 llama.cpp/tools/server/webui/.gitignore create mode 100644 llama.cpp/tools/server/webui/.npmrc create mode 100644 llama.cpp/tools/server/webui/.prettierignore create mode 100644 llama.cpp/tools/server/webui/.prettierrc create mode 100644 llama.cpp/tools/server/webui/.storybook/ModeWatcherDecorator.svelte create mode 100644 llama.cpp/tools/server/webui/.storybook/TooltipProviderDecorator.svelte create mode 100644 llama.cpp/tools/server/webui/.storybook/main.ts create mode 100644 llama.cpp/tools/server/webui/.storybook/preview.ts create mode 100644 llama.cpp/tools/server/webui/.storybook/vitest.setup.ts create mode 100644 llama.cpp/tools/server/webui/README.md create mode 100644 llama.cpp/tools/server/webui/components.json create mode 100644 llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md create mode 100644 llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/chat-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/conversations-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/data-flow-simplified-model-mode.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/database-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/models-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/server-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/settings-flow.md create mode 100644 llama.cpp/tools/server/webui/eslint.config.js create mode 100644 llama.cpp/tools/server/webui/package-lock.json create mode 100644 llama.cpp/tools/server/webui/package.json create mode 100644 llama.cpp/tools/server/webui/playwright.config.ts create mode 100644 llama.cpp/tools/server/webui/scripts/dev.sh create mode 100755 llama.cpp/tools/server/webui/scripts/install-git-hooks.sh create mode 100755 llama.cpp/tools/server/webui/scripts/post-build.sh create mode 100644 llama.cpp/tools/server/webui/src/app.css create mode 100644 llama.cpp/tools/server/webui/src/app.d.ts create mode 100644 llama.cpp/tools/server/webui/src/app.html create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsViewAll.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte create 
mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormHelperText.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageBranchingControls.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageThinkingBlock.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenDragOverlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFooter.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsParameterSourceIndicator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/handle-mobile-sidebar-item-click.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentPreview.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentsViewAll.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatError.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatSettings.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConfirmation.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConversationSelection.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConversationTitleUpdate.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogEmptyFileAlert.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogModelNotAvailable.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ActionButton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ActionDropdown.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeModality.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/CodePreviewDialog.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/CopyToClipboardIcon.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/RemoveButton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerLoadingSplash.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-cancel.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-footer.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/badge/badge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/badge/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/button/button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/button/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/checkbox/checkbox.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/checkbox/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/index.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-checkbox-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-group-heading.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-shortcut.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/input/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/input/input.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/label/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/label/label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/scroll-area-scrollbar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/scroll-area.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-group-heading.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-scroll-down-button.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/select/select-scroll-up-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/separator/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/separator/separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/constants.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/context.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-input.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-inset.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-badge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-skeleton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-provider.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-rail.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/skeleton/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/skeleton/skeleton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-body.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-caption.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-cell.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-head.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-row.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/textarea/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/textarea/textarea.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/tooltip-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/utils.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/auto-scroll.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/binary-detection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/default-context.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/floating-ui-constraints.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/icons.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/input-classes.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/latex-protection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/literal-html.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/localstorage-keys.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/max-bundle-size.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/precision.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/processing-info.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/supported-file-types.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/table-html-restorer.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/constants/tooltip-config.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/viewport.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/attachment.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/chat.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/model.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/server.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/is-mobile.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/use-model-change-validation.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/literal-html.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/table-html-restorer.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/chat.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/database.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/models.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/props.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/models.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/persisted.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/api.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/chat.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/database.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/models.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/settings.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/api-headers.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/attachment-display.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/attachment-type.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/audio-recording.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/autoresize-textarea.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/branching.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/browser-only.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/config-helpers.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/conversation-utils.ts create 
mode 100644 llama.cpp/tools/server/webui/src/lib/utils/convert-files-to-extra.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/file-type.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/formatters.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/is-ime-composing.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/modality-file-validation.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/model-names.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/portal-to-body.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/precision.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/process-uploaded-files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/svg-to-png.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/syntax-highlight-language.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/text-files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/text.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/webp-to-png.ts create mode 100644 llama.cpp/tools/server/webui/src/routes/+error.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+layout.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+page.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+page.ts create mode 100644 llama.cpp/tools/server/webui/src/routes/chat/[id]/+page.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/chat/[id]/+page.ts create mode 100644 llama.cpp/tools/server/webui/src/styles/katex-custom.scss create mode 100644 llama.cpp/tools/server/webui/static/favicon.svg create mode 100644 llama.cpp/tools/server/webui/static/loading.html create mode 100644 llama.cpp/tools/server/webui/svelte.config.js create mode 100644 llama.cpp/tools/server/webui/tests/client/components/TestWrapper.svelte create mode 100644 llama.cpp/tools/server/webui/tests/client/page.svelte.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/e2e/demo.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatForm.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatMessage.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatSettings.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatSidebar.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/Introduction.mdx create mode 100644 llama.cpp/tools/server/webui/tests/stories/MarkdownContent.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/ai-tutorial.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/api-docs.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/1.jpg create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/beautiful-flowers-lotus.webp create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/example.pdf create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/hf-logo.svg create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/blog-post.ts create mode 
100644 llama.cpp/tools/server/webui/tests/stories/fixtures/data-analysis.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/empty.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/math-formulas.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/readme.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/storybook-mocks.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/latex-protection.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/model-names.test.ts create mode 100644 llama.cpp/tools/server/webui/tsconfig.json create mode 100644 llama.cpp/tools/server/webui/vite.config.ts create mode 100644 llama.cpp/tools/server/webui/vitest-setup-client.ts create mode 100644 llama.cpp/tools/tokenize/CMakeLists.txt create mode 100644 llama.cpp/tools/tokenize/tokenize.cpp create mode 100644 llama.cpp/tools/tts/CMakeLists.txt create mode 100644 llama.cpp/tools/tts/README.md create mode 100644 llama.cpp/tools/tts/convert_pt_to_hf.py create mode 100644 llama.cpp/tools/tts/tts-outetts.py create mode 100644 llama.cpp/tools/tts/tts.cpp (limited to 'llama.cpp/tools') diff --git a/llama.cpp/tools/CMakeLists.txt b/llama.cpp/tools/CMakeLists.txt new file mode 100644 index 0000000..518f8b9 --- /dev/null +++ b/llama.cpp/tools/CMakeLists.txt @@ -0,0 +1,40 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# flags + +llama_add_compile_flags() + +# tools + +if (EMSCRIPTEN) +else() + add_subdirectory(batched-bench) + add_subdirectory(gguf-split) + add_subdirectory(imatrix) + add_subdirectory(llama-bench) + add_subdirectory(completion) + add_subdirectory(perplexity) + add_subdirectory(quantize) + if (LLAMA_BUILD_SERVER) + add_subdirectory(cli) + add_subdirectory(server) + endif() + add_subdirectory(tokenize) + add_subdirectory(tts) + add_subdirectory(mtmd) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + endif() + add_subdirectory(fit-params) +endif() diff --git a/llama.cpp/tools/batched-bench/CMakeLists.txt b/llama.cpp/tools/batched-bench/CMakeLists.txt new file mode 100644 index 0000000..4a46b57 --- /dev/null +++ b/llama.cpp/tools/batched-bench/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-batched-bench) +add_executable(${TARGET} batched-bench.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/batched-bench/README.md b/llama.cpp/tools/batched-bench/README.md new file mode 100644 index 0000000..df67c47 --- /dev/null +++ b/llama.cpp/tools/batched-bench/README.md @@ -0,0 +1,60 @@ +# llama.cpp/example/batched-bench + +Benchmark the batched decoding performance of `llama.cpp` + +## Usage + +There are 2 modes of operation: + +- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`) +- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. 
`N_KV = PP + B*TG`) + +```bash +./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] + +# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared +./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 + +# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps + +# custom set of batches +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +``` + +## Sample results + +- `PP` - prompt tokens per batch +- `TG` - generated tokens per batch +- `B` - number of batches +- `N_KV` - required KV cache size +- `T_PP` - prompt processing time (i.e. time to first token) +- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`) +- `T_TG` - time to generate all batches +- `S_TG` - text generation speed (`(B*TG)/T_TG`) +- `T` - total time +- `S` - total speed (i.e. all tokens / total time) + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 | +| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 | +| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 | +| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 | +| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 | +| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 | +| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 | +| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 | +| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 | +| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 | +| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 | +| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 | + +### JSONL output + +Pass `--output-format jsonl` to output JSONL instead of Markdown, á la + +```json lines +{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094} +{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854} +``` diff --git a/llama.cpp/tools/batched-bench/batched-bench.cpp b/llama.cpp/tools/batched-bench/batched-bench.cpp new file mode 100644 index 0000000..0f627c5 --- /dev/null +++ b/llama.cpp/tools/batched-bench/batched-bench.cpp @@ -0,0 +1,256 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" + +#include +#include +#include +#include + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); +} + +int main(int argc, 
char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + return 1; + } + + common_init(); + + int is_pp_shared = params.is_pp_shared; + int is_tg_separate = params.is_tg_separate; + + std::vector n_pp = params.n_pp; + std::vector n_tg = params.n_tg; + std::vector n_pl = params.n_pl; + + // init LLM + + llama_backend_init(); + llama_numa_init(params.numa); + + // initialize the model + + llama_model_params model_params = common_model_params_to_llama(params); + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + llama_context_params ctx_params = common_context_params_to_llama(params); + + // ensure enough sequences are available + ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); + + llama_context * ctx = llama_init_from_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + llama_model_free(model); + return 1; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + const auto get_token_rand = [n_vocab]() -> llama_token { + return std::rand() % n_vocab; + }; + + auto * mem = llama_get_memory(ctx); + + const int32_t n_kv_max = llama_n_ctx(ctx); + + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); + + // decode in batches of ctx_params.n_batch tokens + auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch, bool synchronize) { + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + return false; + } + + if (synchronize) { + llama_synchronize(ctx); + } + } + + return true; + }; + + // warm up + { + for (int i = 0; i < 16; ++i) { + common_batch_add(batch, get_token_rand(), i, { 0 }, false); + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + + if (!params.batched_bench_output_jsonl) { + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, is_tg_separate = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), is_pp_shared, is_tg_separate, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + } + + for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { + for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { + for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) { + const int pp = n_pp[i_pp]; + const int tg = n_tg[i_tg]; + const int pl = n_pl[i_pl]; + + 
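+                // KV cache required for this combination: with a shared prompt, the PP tokens
+                // are stored once (or once per sequence when the KV cache is not unified) plus
+                // TG tokens for each of the PL sequences; otherwise every sequence needs its
+                // own PP + TG tokens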
const int n_ctx_req = is_pp_shared ? (params.kv_unified ? pp : pl*pp) + pl*tg : pl*(pp + tg); + + if (n_ctx_req > n_kv_max) { + continue; + } + + common_batch_clear(batch); + + for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { + for (int i = 0; i < pp; ++i) { + common_batch_add(batch, get_token_rand(), i, { j }, i == pp - 1); + } + } + + llama_memory_clear(mem, false); + + const auto t_pp_start = ggml_time_us(); + + if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + + llama_synchronize(ctx); + + const auto t_pp_end = ggml_time_us(); + + if (is_pp_shared) { + for (int32_t i = 1; i < pl; ++i) { + llama_memory_seq_cp(mem, 0, i, -1, -1); + } + + if (!params.kv_unified) { + // run one dummy token to apply the memory copy + common_batch_clear(batch); + common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true); + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + llama_memory_seq_rm(mem, 0, pp, -1); + } + } + + const auto t_tg_start = ggml_time_us(); + + if (is_tg_separate) { + // decode pattern: + // 0 0 0 ... 1 1 1 ... 2 2 2 ... 3 3 3 ... + for (int j = 0; j < pl; ++j) { + for (int i = 0; i < tg; ++i) { + common_batch_clear(batch); + + common_batch_add(batch, get_token_rand(), pp + i, { j }, true); + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + } + } else { + // decode pattern: + // 0123 0123 0123 ... + for (int i = 0; i < tg; ++i) { + common_batch_clear(batch); + + for (int j = 0; j < pl; ++j) { + common_batch_add(batch, get_token_rand(), pp + i, { j }, true); + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + } + + const auto t_tg_end = ggml_time_us(); + + const int32_t n_kv = n_ctx_req; + + const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f; + const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f; + const float t = t_pp + t_tg; + + const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp; + const float speed_tg = pl*tg / t_tg; + const float speed = ((is_pp_shared ? 
pp : pl*pp) + pl*tg) / t; + + if(params.batched_bench_output_jsonl) { + LOG( + "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " + "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", + n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, + pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed + ); + } else { + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + } + } + } + } + + LOG("\n"); + llama_perf_context_print(ctx); + + llama_batch_free(batch); + + llama_free(ctx); + llama_model_free(model); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/cli/CMakeLists.txt b/llama.cpp/tools/cli/CMakeLists.txt new file mode 100644 index 0000000..b08fff4 --- /dev/null +++ b/llama.cpp/tools/cli/CMakeLists.txt @@ -0,0 +1,10 @@ +set(TARGET llama-cli) +add_executable(${TARGET} cli.cpp) +target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +include_directories(../server) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/cli/README.md b/llama.cpp/tools/cli/README.md new file mode 100644 index 0000000..4a15cba --- /dev/null +++ b/llama.cpp/tools/cli/README.md @@ -0,0 +1,192 @@ +# llama.cpp/tools/cli + +## Usage + + + + + +### Common params + +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `--license` | show source code license and dependencies | +| `-cl, --cache-list` | show list of models in cache | +| `--completion-bash` | print source-able bash completion script for llama.cpp | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) | +| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) | +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) | +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)
(env: LLAMA_ARG_N_PREDICT) | +| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | +| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | +| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | +| `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | +| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | whether to memory-map the model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if llama.cpp was run without this option previously, it is recommended to drop the system page cache before using it
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | +| `--list-devices` | print list of available devices and exit | +| `-ot, --override-tensor =,...` | override tensor buffer type
(env: LLAMA_ARG_OVERRIDE_TENSOR) | +| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | +| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | +| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')
(env: LLAMA_ARG_FIT) | +| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024
(env: LLAMA_ARG_FIT_TARGET) | +| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096
(env: LLAMA_ARG_FIT_CTX) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | +| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) | +| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)
note: use comma-separated values | +| `--control-vector FNAME` | add a control vector
note: use comma-separated values to add multiple control vectors | +| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE
note: use comma-separated values (format: FNAME:SCALE,...) | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, defaults to ai/. quant is optional, defaults to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, defaults to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | +| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | +| `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | +| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | +| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | +| `--log-disable` | Log disable | +| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | +| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | +| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) | +| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) | + + +### Sampling params + +| Argument | Explanation | +| -------- | ----------- | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | +| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--temp N` | temperature (default: 0.80) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | +| `--dry-base N` | set DRY sampling base value (default: 1.75) | +| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | +| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | +| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | +| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar-file FNAME` | file to read grammar from | +| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)
(env: LLAMA_ARG_BACKEND_SAMPLING) | + + +### CLI-specific params + +| Argument | Explanation | +| -------- | ----------- | +| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | +| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | +| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | +| `--show-timings, --no-show-timings` | whether to show timing information after each response (default: true)
(env: LLAMA_ARG_SHOW_TIMINGS) | +| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) | +| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | +| `-sp, --special` | special tokens output enabled (default: false) | +| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:
- does not print special tokens and suffix/prefix
- interactive mode is also enabled
(default: auto enabled if chat template is available) | +| `-st, --single-turn` | run conversation for a single turn only, then exit when done
will not be interactive if first turn is predefined with --prompt
(default: false) | +| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | +| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | +| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | +| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | +| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | +| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files | +| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | +| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | +| `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | +| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | +| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | +| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | +| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | +| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | +| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | +| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) | +| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) | +| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) | +| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) | + + diff --git a/llama.cpp/tools/cli/cli.cpp b/llama.cpp/tools/cli/cli.cpp new file mode 100644 index 0000000..02ccb72 --- /dev/null +++ b/llama.cpp/tools/cli/cli.cpp @@ -0,0 +1,421 @@ +#include "common.h" +#include "arg.h" +#include "console.h" +// #include "log.h" + +#include "server-context.h" +#include "server-task.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + +const char * LLAMA_ASCII_LOGO = R"( +▄▄ ▄▄ +██ ██ +██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ +██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ +██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ + ██ ██ + ▀▀ ▀▀ +)"; + +static std::atomic g_is_interrupted = false; +static bool should_stop() { + return g_is_interrupted.load(); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void signal_handler(int) { + if (g_is_interrupted.load()) { + // second Ctrl+C - exit immediately + // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) + fprintf(stdout, "\033[0m\n"); + fflush(stdout); + std::exit(130); + } + g_is_interrupted.store(true); +} +#endif + +struct cli_context { + server_context ctx_server; + json messages = json::array(); + std::vector input_files; + task_params defaults; + + // thread for showing "loading" animation + std::atomic loading_show; + + cli_context(const common_params & params) { + defaults.sampling = params.sampling; + defaults.speculative = params.speculative; + defaults.n_keep = params.n_keep; + defaults.n_predict = params.n_predict; + defaults.antiprompt = params.antiprompt; + + defaults.stream = true; // make sure we always use streaming mode + defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way + // defaults.return_progress = true; // TODO: show progress + } + + std::string generate_completion(result_timings & out_timings) { + server_response_reader rd = ctx_server.get_response_reader(); + auto chat_params = format_chat(); + { + // TODO: reduce some copies here in the future + server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); + task.id = rd.get_new_id(); + task.index = 0; + task.params = defaults; // copy + task.cli_prompt = chat_params.prompt; // copy + task.cli_files = input_files; // copy + task.cli = true; + + // chat template settings + task.params.chat_parser_params = common_chat_parser_params(chat_params); + task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + if (!chat_params.parser.empty()) { + task.params.chat_parser_params.parser.load(chat_params.parser); + } + + rd.post_task({std::move(task)}); + } + + // wait for first result + console::spinner::start(); + server_task_result_ptr result = rd.next(should_stop); + + console::spinner::stop(); + std::string curr_content; + bool is_thinking = false; + + while (result) { + if (should_stop()) { + break; + } + if 
(result->is_error()) { + json err_data = result->to_json(); + if (err_data.contains("message")) { + console::error("Error: %s\n", err_data["message"].get().c_str()); + } else { + console::error("Error: %s\n", err_data.dump().c_str()); + } + return curr_content; + } + auto res_partial = dynamic_cast(result.get()); + if (res_partial) { + out_timings = std::move(res_partial->timings); + for (const auto & diff : res_partial->oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (is_thinking) { + console::log("\n[End thinking]\n\n"); + console::set_display(DISPLAY_TYPE_RESET); + is_thinking = false; + } + curr_content += diff.content_delta; + console::log("%s", diff.content_delta.c_str()); + console::flush(); + } + if (!diff.reasoning_content_delta.empty()) { + console::set_display(DISPLAY_TYPE_REASONING); + if (!is_thinking) { + console::log("[Start thinking]\n"); + } + is_thinking = true; + console::log("%s", diff.reasoning_content_delta.c_str()); + console::flush(); + } + } + } + auto res_final = dynamic_cast(result.get()); + if (res_final) { + out_timings = std::move(res_final->timings); + break; + } + result = rd.next(should_stop); + } + g_is_interrupted.store(false); + // server_response_reader automatically cancels pending tasks upon destruction + return curr_content; + } + + // TODO: support remote files in the future (http, https, etc) + std::string load_input_file(const std::string & fname, bool is_media) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return ""; + } + if (is_media) { + raw_buffer buf; + buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + input_files.push_back(std::move(buf)); + return mtmd_default_marker(); + } else { + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return content; + } + } + + common_chat_params format_chat() { + auto meta = ctx_server.get_meta(); + auto & chat_params = meta.chat_params; + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = {}; // TODO + inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = chat_params.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.enable_thinking = chat_params.enable_thinking; + + // Apply chat template to the list of messages + return common_chat_templates_apply(chat_params.tmpls.get(), inputs); + } +}; + +int main(int argc, char ** argv) { + common_params params; + + params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { + return 1; + } + + // TODO: maybe support it later? 
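+    // llama-cli only runs in conversation (chat) mode; plain completion is handled by the
+    // separate llama-completion tool, as the messages below point out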
+ if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { + console::error("--no-conversation is not supported by llama-cli\n"); + console::error("please use llama-completion instead\n"); + } + + common_init(); + + // struct that contains llama context and inference + cli_context ctx_cli(params); + + llama_backend_init(); + llama_numa_init(params.numa); + + // TODO: avoid using atexit() here by making `console` a singleton + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + console::set_display(DISPLAY_TYPE_RESET); + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + console::log("\nLoading model... "); // followed by loading animation + console::spinner::start(); + if (!ctx_cli.ctx_server.load_model(params)) { + console::spinner::stop(); + console::error("\nFailed to load the model\n"); + return 1; + } + + console::spinner::stop(); + console::log("\n"); + + std::thread inference_thread([&ctx_cli]() { + ctx_cli.ctx_server.start_loop(); + }); + + auto inf = ctx_cli.ctx_server.get_meta(); + std::string modalities = "text"; + if (inf.has_inp_image) { + modalities += ", vision"; + } + if (inf.has_inp_audio) { + modalities += ", audio"; + } + + if (!params.system_prompt.empty()) { + ctx_cli.messages.push_back({ + {"role", "system"}, + {"content", params.system_prompt} + }); + } + + console::log("\n"); + console::log("%s\n", LLAMA_ASCII_LOGO); + console::log("build : %s\n", inf.build_info.c_str()); + console::log("model : %s\n", inf.model_name.c_str()); + console::log("modalities : %s\n", modalities.c_str()); + if (!params.system_prompt.empty()) { + console::log("using custom system prompt\n"); + } + console::log("\n"); + console::log("available commands:\n"); + console::log(" /exit or Ctrl+C stop or exit\n"); + console::log(" /regen regenerate the last response\n"); + console::log(" /clear clear the chat history\n"); + console::log(" /read add a text file\n"); + if (inf.has_inp_image) { + console::log(" /image add an image file\n"); + } + if (inf.has_inp_audio) { + console::log(" /audio add an audio file\n"); + } + console::log("\n"); + + // interactive loop + std::string cur_msg; + while (true) { + std::string buffer; + console::set_display(DISPLAY_TYPE_USER_INPUT); + if (params.prompt.empty()) { + console::log("\n> "); + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + } else { + // process input prompt from args + for (auto & fname : params.image) { + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + break; + } + console::log("Loaded media from '%s'\n", fname.c_str()); + cur_msg += marker; + } + buffer = params.prompt; + if (buffer.size() > 500) { + console::log("\n> %s ... 
(truncated)\n", buffer.substr(0, 500).c_str()); + } else { + console::log("\n> %s\n", buffer.c_str()); + } + params.prompt.clear(); // only use it once + } + console::set_display(DISPLAY_TYPE_RESET); + console::log("\n"); + + if (should_stop()) { + g_is_interrupted.store(false); + break; + } + + // remove trailing newline + if (!buffer.empty() &&buffer.back() == '\n') { + buffer.pop_back(); + } + + // skip empty messages + if (buffer.empty()) { + continue; + } + + bool add_user_msg = true; + + // process commands + if (string_starts_with(buffer, "/exit")) { + break; + } else if (string_starts_with(buffer, "/regen")) { + if (ctx_cli.messages.size() >= 2) { + size_t last_idx = ctx_cli.messages.size() - 1; + ctx_cli.messages.erase(last_idx); + add_user_msg = false; + } else { + console::error("No message to regenerate.\n"); + continue; + } + } else if (string_starts_with(buffer, "/clear")) { + ctx_cli.messages.clear(); + ctx_cli.input_files.clear(); + console::log("Chat history cleared.\n"); + continue; + } else if ( + (string_starts_with(buffer, "/image ") && inf.has_inp_image) || + (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { + // just in case (bad copy-paste for example), we strip all trailing/leading spaces + std::string fname = string_strip(buffer.substr(7)); + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + console::log("Loaded media from '%s'\n", fname.c_str()); + continue; + } else if (string_starts_with(buffer, "/read ")) { + std::string fname = string_strip(buffer.substr(6)); + std::string marker = ctx_cli.load_input_file(fname, false); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + console::log("Loaded text from '%s'\n", fname.c_str()); + continue; + } else { + // not a command + cur_msg += buffer; + } + + // generate response + if (add_user_msg) { + ctx_cli.messages.push_back({ + {"role", "user"}, + {"content", cur_msg} + }); + cur_msg.clear(); + } + result_timings timings; + std::string assistant_content = ctx_cli.generate_completion(timings); + ctx_cli.messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + console::log("\n"); + + if (params.show_timings) { + console::set_display(DISPLAY_TYPE_INFO); + console::log("\n"); + console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); + console::set_display(DISPLAY_TYPE_RESET); + } + + if (params.single_turn) { + break; + } + } + + console::set_display(DISPLAY_TYPE_RESET); + + console::log("\nExiting...\n"); + ctx_cli.ctx_server.terminate(); + inference_thread.join(); + + // bump the log level to display timings + common_log_set_verbosity_thold(LOG_LEVEL_INFO); + llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); + + return 0; +} diff --git a/llama.cpp/tools/completion/CMakeLists.txt b/llama.cpp/tools/completion/CMakeLists.txt new file mode 100644 index 0000000..126ae6a --- /dev/null +++ b/llama.cpp/tools/completion/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-completion) +add_executable(${TARGET} completion.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git 
a/llama.cpp/tools/completion/README.md b/llama.cpp/tools/completion/README.md new file mode 100644 index 0000000..3ca3e68 --- /dev/null +++ b/llama.cpp/tools/completion/README.md @@ -0,0 +1,578 @@ +# llama.cpp/tools/completion + +This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Usage](#usage) +3. [Common Options](#common-options) +4. [Input Prompts](#input-prompts) +5. [Interaction](#interaction) +6. [Context Management](#context-management) +7. [Generation Flags](#generation-flags) +8. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) +9. [Additional Options](#additional-options) + +## Quick Start + +To get started right away, run the following command, making sure to use the correct path for the model you have: + +First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face. +[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true) + +Once downloaded, place your model in the models folder in llama.cpp. + +### Unix-based systems (Linux, macOS, etc.): + +##### Input prompt (One-and-done) + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" +``` +##### Conversation mode (Allow for continuous interaction with the model) + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" +``` + +##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 +``` + +### Windows: + +##### Input prompt (One-and-done) +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" +``` +##### Conversation mode (Allow for continuous interaction with the model) + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" +``` + +#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): + +```powershell +llama-completion.exe -m 
models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 +``` + +## Usage + + + + + +### Common params + +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `--license` | show source code license and dependencies | +| `-cl, --cache-list` | show list of models in cache | +| `--completion-bash` | print source-able bash completion script for llama.cpp | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) | +| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) | +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) | +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | +| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | +| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | +| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | +| `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | +| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | +| `--list-devices` | print list of available devices and exit | +| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type
(env: LLAMA_ARG_OVERRIDE_TENSOR) | +| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | +| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | +| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')
(env: LLAMA_ARG_FIT) | +| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024
(env: LLAMA_ARG_FIT_TARGET) | +| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096
(env: LLAMA_ARG_FIT_CTX) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | +| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) | +| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)
note: use comma-separated values | +| `--control-vector FNAME` | add a control vector
note: use comma-separated values to add multiple control vectors | +| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE
note: use comma-separated values (format: FNAME:SCALE,...) | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | +| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | +| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | +| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | +| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | +| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | +| `--log-disable` | Log disable | +| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | +| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | +| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) | +| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) | + + +### Sampling params + +| Argument | Explanation | +| -------- | ----------- | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | +| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--temp N` | temperature (default: 0.80) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | +| `--dry-base N` | set DRY sampling base value (default: 1.75) | +| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | +| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | +| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | +| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar-file FNAME` | file to read grammar from | +| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)
(env: LLAMA_ARG_BACKEND_SAMPLING) | + + +### Completion-specific params + +| Argument | Explanation | +| -------- | ----------- | +| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | +| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | +| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) | +| `-ptc, --print-token-count N` | print token count every N tokens (default: -1) | +| `--prompt-cache FNAME` | file to cache prompt state for faster startup (default: none) | +| `--prompt-cache-all` | if specified, saves user input and generations to cache as well | +| `--prompt-cache-ro` | if specified, uses the prompt cache but does not update it | +| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | +| `-sp, --special` | special tokens output enabled (default: false) | +| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:
- does not print special tokens and suffix/prefix
- interactive mode is also enabled
(default: auto enabled if chat template is available) | +| `-st, --single-turn` | run conversation for a single turn only, then exit when done
will not be interactive if first turn is predefined with --prompt
(default: false) | +| `-i, --interactive` | run in interactive mode (default: false) | +| `-if, --interactive-first` | run in interactive mode and wait for input right away (default: false) | +| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' | +| `--in-prefix-bos` | prefix BOS to user inputs, preceding the `--in-prefix` string | +| `--in-prefix STRING` | string to prefix user inputs with (default: empty) | +| `--in-suffix STRING` | string to suffix after user inputs with (default: empty) | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | +| `-gan, --grp-attn-n N` | group-attention factor (default: 1)
(env: LLAMA_ARG_GRP_ATTN_N) | +| `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | +| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | + + + +## Common Options + +In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference. +- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\' +- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. + +## Input Prompts + +The `llama-completion` program provides several ways to interact with the LLaMA models using input prompts: + +- `--prompt PROMPT`: Provide a prompt directly as a command-line option. +- `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)). +- `--system-prompt-file FNAME`: Provide a file containing a system prompt. +- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) + +## Interaction + +The `llama-completion` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`. + +In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. + +### Interaction Options + +- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. +- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. 
+- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found) +- `-no-cnv`: Disable conversation mode (default: false) +- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit. +- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false) +- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text. + +By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. + +### Reverse Prompts + +Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered: + +- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space. + +To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt. + +### In-Prefix + +The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./llama-completion -r "User:" --in-prefix " " +``` + +### In-Suffix + +The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./llama-completion -r "User:" --in-prefix " " --in-suffix "Assistant:" +``` +When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + +### Chat templates + + `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + + Example usage: `--chat-template gemma` + +`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains an outdated or incompatible template; some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py + +## Context Management + +During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text.
When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. + +### Context Size + +- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference. + +### Extended Context Size + +Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. + +- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. + +### Keep Prompt + +The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. + +- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + +By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. + +## Generation Flags + +The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case. + +### Number of Tokens to Predict + +- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled) + +The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. + +A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output. + +If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. + +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. + +It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. 
If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. + +### Temperature + +- `--temp N`: Adjust the randomness of the generated text (default: 0.8). + +Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + +Example usage: `--temp 0` + +### Repeat Penalty + +- `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). +- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). + +The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. + +The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). + +### DRY Repetition Penalty + +DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). + +- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled). +- `--dry-base N`: Set the DRY sampling base value (default: 1.75). +- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2). +- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size). +- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used. + +The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8. + +The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions. + +The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words. + +The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens. 
+ +The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied. + +DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence. + +Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"` + +### Top-K Sampling + +- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40). + +Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. + +Example usage: `--top-k 30` + +### Top-P Sampling + +- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + +Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. + +Example usage: `--top-p 0.95` + +### Min-P Sampling + +- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1). + +The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. + +Example usage: `--min-p 0.05` + +### Adaptive-P Sampling + +- `--adaptive-target N`: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) +- `--adaptive-decay N`: EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) + +Adaptive-P: Select tokens near a configurable target probability over time. + +The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average of the *ORIGINAL* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended. It is suggested to apply min-p before adaptive-p as the only other active sampler. 
+ +Recommended starting values: `--adaptive-target 0.55 --adaptive-decay 0.9` + +For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927) + +### Locally Typical Sampling + +- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). + +Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling. + +Example usage: `--typical 0.9` + +### Mirostat Sampling + +- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). +- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1). +- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0). + +Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps). + +The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`. + +The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`. + +Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` + +### XTC Sampling + +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. + +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. 
+ +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` + +### Top-nσ Sampling + +- `--top-nsigma N`: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1, -1 = disabled). + +Top-nσ sampling is a text generation method that selects tokens based on a statistical threshold in pre-softmax logits. It works by only sampling from tokens with logits that are within n * σ of the maximum logit. This method helps maintain a stable sampling space regardless of temperature scaling, allowing it to perform well on reasoning tasks even in high temperatures. Without complex probability manipulation, it efficiently filters tokens directly on the pre-softmax logits. A higher value for top-nsigma (e.g., 5) will take more noisy tokens into consideration, while a lower value (e.g., 1) will focus on the more informative region of the sampling space. + +Example usage: `--top-nsigma 1` + +### Logit Bias + +- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. + +The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated. + +For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced. + +A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.) + +Example usage: `--logit-bias 29905-inf` + +### RNG Seed + +- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed). + +The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. + +## Performance Tuning and Memory Options + +These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case. + +### Number of Threads + +- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
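+
+For illustration, on a hypothetical machine with 8 physical cores you might pin generation to the physical core count while giving prompt processing a few extra threads (the exact values are system-dependent and worth benchmarking):
+
+```bash
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -t 8 -tb 12 -p "Once upon a time"
+```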
+ +### Mlock + +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM. + +### No Memory Mapping + +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all. + +### NUMA support + +- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. +- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node. +- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus. + + These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. + +### Batch Size + +- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`. + +- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`. + +### Prompt Caching + +- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
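+
+As a sketch of typical usage (the cache file name is illustrative), the first run below evaluates the prompt and saves the state, and a later run with the same leading prompt restores it instead of re-evaluating; add `--prompt-cache-ro` to reuse the cache without updating it:
+
+```bash
+# first run: evaluates the prompt and writes the cached state
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.cache -p "Once upon a time" -n 64
+
+# later run: restores the cached prompt state, without updating the cache
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.cache --prompt-cache-ro -p "Once upon a time" -n 64
+```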
+ +### Grammars & JSON schemas + +- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. + +- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead. + +### Quantization + +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). + +## LoRA (Low-Rank Adaptation) adapters + +- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters. +- `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters. + +You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`. + +LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed. + +## Additional Options + +These options provide extra functionality and customization when running the LLaMA models: + +- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. +- `--verbose-prompt`: Print the prompt before generating text. +- `--no-display-prompt`: Don't print prompt at generation. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple devices this option controls how tensors should be split across devices. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each device should get in order. For example, "3,2" will assign 60% of the data to device 0 and 40% to device 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. The list of the devices which are being used is printed on startup and can be different from the device list given by `--list-devices` or e.g. `nvidia-smi`. +- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. 
If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache. diff --git a/llama.cpp/tools/completion/completion.cpp b/llama.cpp/tools/completion/completion.cpp new file mode 100644 index 0000000..9771327 --- /dev/null +++ b/llama.cpp/tools/completion/completion.cpp @@ -0,0 +1,1001 @@ +#include "arg.h" +#include "common.h" +#include "console.h" +#include "log.h" +#include "sampling.h" +#include "llama.h" +#include "chat.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static llama_context ** g_ctx; +static llama_model ** g_model; +static common_sampler ** g_smpl; +static common_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; +static bool is_interacting = false; +static bool need_insert_eot = false; + +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]); + LOG("\n"); +} + +static bool file_exists(const std::string & path) { + std::ifstream f(path.c_str()); + return f.good(); +} + +static bool file_is_empty(const std::string & path) { + std::ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); + f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + return f.tellg() == 0; +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting && g_params->interactive) { + is_interacting = true; + need_insert_eot = true; + } else { + console::cleanup(); + LOG("\n"); + common_perf_print(*g_ctx, *g_smpl); + + // make sure all logs are flushed + LOG("Interrupted by user\n"); + common_log_pause(common_log_main()); + + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + common_params params; + g_params = ¶ms; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) { + return 1; + } + + common_init(); + + auto & sparams = params.sampling; + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.embedding) { + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_INF("%s: llama backend init\n", __func__); + + 
llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = nullptr; + llama_context * ctx = nullptr; + common_sampler * smpl = nullptr; + + g_model = &model; + g_ctx = &ctx; + g_smpl = &smpl; + + std::vector chat_msgs; + + // load the model and apply lora adapter, if any + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); + + auto llama_init = common_init_from_params(params); + + ctx = llama_init->context(); + model = llama_init->model(); + smpl = llama_init->sampler(0); + + if (ctx == NULL) { + LOG_ERR("%s: error: unable to create context\n", __func__); + return 1; + } + + llama_memory_t mem = llama_get_memory(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + // note: the time for chat template initialization is not negligible: + auto chat_templates = common_chat_templates_init(model, params.chat_template); + + // start measuring performance timings from here + llama_perf_context_reset(ctx); + + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + LOG_ERR("%s: no CPU backend found\n", __func__); + return 1; + } + auto * reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + if (!set_process_priority(params.cpuparams.priority)) { + LOG_ERR("%s: error: failed to set process priority\n", __func__); + return 1; + } + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); + if (!threadpool_batch) { + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; + } + + // start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + + const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + if (n_ctx > n_ctx_train) { + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // auto enable conversation mode if chat template is available + const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get()); + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. 
This may cause the model to output suboptimal responses\n", __func__); + } + + // print chat template example in conversation mode + if (params.conversation_mode) { + if (params.enable_chat_template) { + if (!params.prompt.empty() && params.system_prompt.empty()) { + LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n"); + } + + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str()); + } else { + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + + // print system information + { + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!file_exists(path_session)) { + LOG_INF("%s: session file does not exist, will create.\n", __func__); + } else if (file_is_empty(path_session)) { + LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__); + } else { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja; + if (!llama_model_has_encoder(model)) { + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + } + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); + + std::vector embd_inp; + + bool waiting_for_first_input = false; + auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { + common_chat_msg new_msg; + new_msg.role = role; + new_msg.content = content; + auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back(new_msg); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); + return formatted; + }; + + std::string prompt; + { + if (params.conversation_mode && params.enable_chat_template) { + if (!params.system_prompt.empty()) { + // format the system prompt (will use template default if empty) + chat_add_and_format("system", params.system_prompt); + } + + if (!params.prompt.empty()) { + // format and append the user prompt + chat_add_and_format("user", params.prompt); + } else { + waiting_for_first_input = true; + } + + if (!params.system_prompt.empty() || !params.prompt.empty()) { + common_chat_templates_inputs inputs; + inputs.use_jinja = g_params->use_jinja; + inputs.messages = chat_msgs; + inputs.add_generation_prompt = !params.prompt.empty(); + + prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; + } + } else { + // otherwise use the prompt as is + prompt = params.prompt; + } + + if (params.interactive_first || !prompt.empty() || session_tokens.empty()) { + LOG_DBG("tokenize the prompt\n"); + embd_inp = common_tokenize(ctx, prompt, true, true); + } else { + LOG_DBG("use session tokens\n"); + embd_inp = 
session_tokens; + } + + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); + } + + // Should not run without any tokens + if (!waiting_for_first_input && embd_inp.empty()) { + if (add_bos) { + embd_inp.push_back(llama_vocab_bos(vocab)); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + } else { + LOG_ERR("input is empty\n"); + return -1; + } + } + + // Tokenize negative prompt + if ((int) embd_inp.size() > n_ctx - 4) { + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + bool session_do_save = false; + + { + size_t n_match = 0; + + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_match >= embd_inp.size() || id != embd_inp[n_match]) { + break; + } + n_match++; + } + if (params.prompt.empty() && n_match == embd_inp.size()) { + LOG_INF("%s: using full prompt from session file\n", __func__); + } else if (n_match >= embd_inp.size()) { + LOG_INF("%s: session file has exact match for prompt!\n", __func__); + } else if (n_match < (embd_inp.size() / 2)) { + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_match, embd_inp.size()); + } else { + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_match, embd_inp.size()); + } + + if (session_tokens.size() == n_match) { + // [TAG_CONTEXT_STATE_LOGITS] + // in this case, we are going to reuse the logits from the session + // if we ever decide to remove the logits from the session, we need to handle this somehow + // ref: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941 + } + + // remove any "future" tokens that we might have inherited from the previous session + if (session_tokens.size() > n_match) { + if (!llama_memory_seq_rm(mem, -1, n_match, -1)) { + LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__); + llama_memory_clear(mem, true); + session_tokens.clear(); + n_match = 0; + } else { + session_tokens.resize(n_match); + } + } + } + + session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { + params.n_keep = (int)embd_inp.size(); + } else { + params.n_keep += add_bos; // always keep the BOS token + } + + if (params.conversation_mode) { + if (params.single_turn && !params.prompt.empty()) { + params.interactive = false; + params.interactive_first = false; + } else { + params.interactive_first = true; + } + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (params.n_keep > add_bos) { + LOG_INF("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_CNT("'\n"); + } + LOG_INF("\n"); + } + + // ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct 
sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (params.interactive) { + LOG_INF("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) { + LOG_INF("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) { + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + int ga_i = 0; + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + } + LOG_INF("\n"); + + if (params.interactive) { + const char * control_message; + if (params.multiline_input) { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_INF("== Running in interactive mode. 
==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_INF( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) { + LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n"); + } + LOG_INF("\n"); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool display = true; + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(DISPLAY_TYPE_PROMPT); + display = params.display_prompt; + + std::vector embd; + + // single-token antiprompts + std::vector antiprompt_token; + + for (const std::string & antiprompt : params.antiprompt) { + auto ids = ::common_tokenize(ctx, antiprompt, false, true); + if (ids.size() == 1) { + antiprompt_token.push_back(ids[0]); + } + } + + if (llama_model_has_encoder(model)) { + int enc_input_size = embd_inp.size(); + llama_token * enc_input_buf = embd_inp.data(); + + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { + LOG_ERR("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (!embd.empty()) { + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(DISPLAY_TYPE_ERROR); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + console::set_display(DISPLAY_TYPE_RESET); + } + + if (ga_n == 1) { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + + if (n_past + (int) embd.size() >= n_ctx) { + if (!params.ctx_shift){ + LOG_WRN("\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } + + if (params.n_predict == -2) { + LOG_WRN("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard); + llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); + } + } else { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) { + const int ib = (ga_n*ga_i)/ga_w; + const int bd = (ga_w/ga_n)*(ga_n - 1); + const int dd = (ga_w/ga_n) - ib*bd - ga_w; + + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + + llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd); + llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + + n_past -= bd; + + ga_i += ga_w/ga_n; + + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + if (!embd.empty()) { + int n_eval = (int) embd.size(); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); + + GGML_ASSERT(n_eval <= params.n_batch); + if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG_DBG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) { + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } + } + + if (!embd.empty() && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // optionally save the session on first sample 
(for faster prompt loading next time) + if (session_do_save) { + session_do_save = false; + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG_DBG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = common_sampler_sample(smpl, ctx, -1); + + common_sampler_accept(smpl, id, /* accept_grammar= */ true); + + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + + embd.push_back(id); + + if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) { + assistant_ss << common_token_to_piece(ctx, id, false); + } + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG_DBG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + + ++n_consumed; + if ((int) embd.size() == params.n_batch) { + break; + } + } + } + + // display text + if (input_echo && display) { + for (auto id : embd) { + const std::string token_str = common_token_to_piece(ctx, id, params.special); + + // Console/Stream Output + LOG("%s", token_str.c_str()); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) { + // Incoming Requested Tokens + input_tokens.push_back(id); + } else { + // Outgoing Generated Tokens + output_tokens.push_back(id); + output_ss << token_str; + } + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + console::set_display(DISPLAY_TYPE_RESET); + display = true; + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + // check for reverse prompt in the last n_prev tokens + if (!params.antiprompt.empty()) { + const int n_prev = 32; + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + // avoid calling common_sampler_last() if last_output is empty + if (!last_output.empty()) { + llama_token last_token = common_sampler_last(smpl); + for (auto token : antiprompt_token) { + if (token == last_token) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + } + + if (is_antiprompt) { + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of generation tokens in interactive mode + if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + LOG_DBG("found an EOG token\n"); + + if (params.interactive) { + if (!params.antiprompt.empty()) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enable_chat_template) { + chat_add_and_format("assistant", assistant_ss.str()); + } + is_interacting = true; + LOG("\n"); + } + } + + if (params.conversation_mode && !waiting_for_first_input) { + if (!prompt.empty()) { + prompt.clear(); + is_interacting = false; + } + } + + if ((n_past > 0 || waiting_for_first_input) && is_interacting) { + LOG_DBG("waiting for user input\n"); + + if (params.conversation_mode) { + LOG("\n> "); + } + + if (params.input_prefix_bos) { + LOG_DBG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_vocab_bos(vocab)); + } + + std::string buffer; + if (!params.input_prefix.empty() && !params.conversation_mode) { + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(DISPLAY_TYPE_USER_INPUT); + display = params.display_prompt; + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(DISPLAY_TYPE_RESET); + display = true; + + if (buffer.empty()) { // Ctrl+D on empty line exits + LOG("EOF by user\n"); + break; + } + + if (buffer.back() == '\n') { + // Implement #587: + // If the user wants the text to end in a newline, + // this should be accomplished by explicitly adding a newline by using \ followed by return, + // then returning control by pressing return again. + buffer.pop_back(); + } + + if (buffer.empty()) { // Enter key on empty line lets the user pass control back + LOG_DBG("empty line, passing control back\n"); + } else { // Add tokens to embd only if the input buffer is non-empty + // append input suffix if any + if (!params.input_suffix.empty() && !params.conversation_mode) { + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); + } + + LOG_DBG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escape) { + string_process_escapes(buffer); + } + + bool format_chat = params.conversation_mode && params.enable_chat_template; + std::string user_inp = format_chat + ? 
chat_add_and_format("user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); + + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) { + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + if (params.verbose_prompt) { + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + const std::string token_str = common_token_to_piece(ctx, token); + output_tokens.push_back(token); + output_ss << token_str; + + if (params.verbose_prompt) { + LOG_INF("%6d -> '%s'\n", token, token_str.c_str()); + } + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + LOG_DBG("n_remain: %d\n", n_remain); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0 || waiting_for_first_input) { + if (is_interacting) { + common_sampler_reset(smpl); + } + is_interacting = false; + + if (waiting_for_first_input && params.single_turn) { + params.interactive = false; + params.interactive_first = false; + } + waiting_for_first_input = false; + } + } + + // end of generation + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { + LOG(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+ if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + LOG("\n\n"); + common_perf_print(ctx, smpl); + + llama_backend_free(); + + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); + + return 0; +} diff --git a/llama.cpp/tools/cvector-generator/CMakeLists.txt b/llama.cpp/tools/cvector-generator/CMakeLists.txt new file mode 100644 index 0000000..baeb4d0 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-cvector-generator) +add_executable(${TARGET} cvector-generator.cpp pca.hpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/cvector-generator/README.md b/llama.cpp/tools/cvector-generator/README.md new file mode 100644 index 0000000..6d5fd74 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/README.md @@ -0,0 +1,45 @@ +# cvector-generator + +This example demonstrates how to generate a control vector using gguf models. + +Related PRs: +- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970) +- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514) + +## Examples + +```sh +# CPU only +./cvector-generator -m ./llama-3.Q4_K_M.gguf + +# With GPU +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 + +# With advanced options +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 + +# Using mean value instead of PCA +./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean + +# To see help message +./cvector-generator -h +# Then, have a look at "cvector" section +``` + +## Tips and tricks + +If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example: + +``` +<|im_start|>system\nAct like a person who is extremely happy.<|im_end|> +<|im_start|>system\nYou are in a very good mood today<|im_end|> +``` + +Example to use output file with `llama-cli`: + +(Tips: The control vector works better when apply to layers higher than 10) + +```sh +./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +``` diff --git a/llama.cpp/tools/cvector-generator/completions.txt b/llama.cpp/tools/cvector-generator/completions.txt new file mode 100644 index 0000000..abc45ff --- /dev/null +++ b/llama.cpp/tools/cvector-generator/completions.txt @@ -0,0 +1,582 @@ + +That game +I can see +Hmm, this +I can relate to +Who is +I understand the +Ugh, +What the hell was +Hey, did anyone +Although +Thank you for choosing +What are you +Oh w +How dare you open +It was my pleasure +I'm hon +I appreciate that you +Are you k +Whoever left this +It's always +Ew, +Hey, I l +Hello? 
Is someone +I understand that +That poem +Aww, poor +Hey, it +Alright, who +I didn't +Well, life +The document +Oh no, this +I'm concerned +Hello, this is +This art +Hmm, this drink +Hi there! +It seems +Is +Good +I can't +Ex +Who are +I can see that +Wow, +Today is a +Hey friend +Sometimes friends +Oh, this old +The weather outside +This place is sur +I appreciate your input +Thank you for the +Look at +I'm disappoint +To my +How dare you +That's an +This piece of art +Eww +This park is +This is incredible +Oh no, someone +Exc +Well, it' +I warned +Hey, I understand +Hey, I saw +How dare you go +What the he +Hey +It's +Hello? Hello? +It +Oh no! +This is the perfect +Good morning, +Oh no, there +It's so +Yeah +Uh, +Hello everyone +Who turned off +The weather +Who' +Hey, this +Wait, +Eww, gross +Excuse +It seems like you +Thank you so +What happened? +Oh my g +I am deeply sad +I war +Okay, let' +Hey, that +That was a beautiful +Oh no! That +What happened +Hey there +The artist' +What?! +Hey, it' +I am disappoint +It seems like +Oh no! The +This park is a +If you +Yes! I did +It sounds +What +Who is it +Hmm, that +That's strange +Yeah, that was +That's interesting +This park +What the hell +Who is that +I feel like my +Oh well +What the hell is +Hello? Hello +To my dearest +Bless you!\" +Thank you for +Oh, looks like +Can you please +This place is +Eww, what +Bless you +Is everything +Hey, I just +Whoever left these +Well, that' +I feel +Hey, do you +It's sad +Oh no, it +Hey, that' +Oh my god, +Thank you, +Hello little one, +I apolog +Hey team, I +How dare you read +Who is this and +Whoever left +Hi there! W +A +If you have +I was +U +Bless +Well, this +Oh, I' +It's a +Eww, +Is everything okay? +Oh, I +Hello, can you +Al +That was a great +What are +I understand that not +Oh no, not +Who is it?\" +Hey, can we +Whoever is taking +I would love to +Hey, I noticed +Hey, could +I understand that there +Hello? +D +Oh man, I +Thank you so much +Oh no, my +Dear [Name +Uh +I remember +Hey, who +Well, it +Are you +I understand that it +Hey, is +I would +Who is this +Excuse me +Alright +I am thrilled +Sometimes friends have +Who the +It's interesting +I would love +E +Hello? Is anyone +Well, this is +This place +Well, +I warned you +Hey, watch where +Oh my +That' +Sometimes friends have different +I understand that everyone +What? +What do these notes +I can relate +I'm not +I understand +To my dear +Guys +Well +Hey, I appreciate +Wow, what +Dear +That melody +Who the hell +Today is +Hello little +Wow, look +That's great +Love is never wrong +I'm having +Whoa, did +Ugh +Can you please provide +I miss you, +I feel uncom +I know +Ugh, this +Hey, watch +Oh great, a +I didn +Okay +That game of char +Oh +I appreciate +Who's there +I am so +Oh great, someone +Hey, could you +I remember wondering +Wait, what? +What do +Hello? Can +Hey there, +That game of +This is incred +Oh my gosh +Oh great, f +I appreciate your +It sounds like +What the heck +Okay, I understand +Ew +I understand that this +Uh, hi +Hi everyone! +What the hell? +Thank you for your +Oh no, the +Wow, I +Who turned +Dear [ +Whoever +This is a +Whoa, he +What in the world +Although the physical +Hello, who is +That's amaz +Hey, I know +Okay, that +Hi everyone +Hey, is everything +I understand your fr +Oh no, poor +Oh, look +Good morning +Ew, gross +Oh no, did +Look at the family +Hey team +Yes! +Hey, can I +Okay, that' +It's great +Love is +Hey, what +Good morning, world +Who is it? 
+That poem really reson +I +That's +I understand the task +Gu +Hello? Who' +This postcard is +Whoa, +Oh, that +I understand that I +Whoever is +Hello? Who is +I'm really +Wow, this +Can +This artwork really +This is a shame +I miss you too +Who are you? +Today is a difficult +Hey, just +Are you okay +I am +Hi, +Wow, that +Hey there! Can +Okay, stay +Oh great, just +Yeah, +Hello? Can you +Oh, looks +Thank you for sharing +I'm glad +Hey, is that +Hmm +It was my +It sounds like you +Wow, your +I was promised certain +That was such a +Thank +Excuse you +That was +Hey team, +I feel un +It was +What' +Hey friend, I +How +Saying goodbye +That +It's heart +How dare +Oh, +Hello, may +What's this +Thank you for recogn +Aww, that +Oh, I remember +Hmm, that' +I miss +I know this +Wait +Is everything okay +Who is that person +Wow, you +Oh great +I'm sad +Wow, the +I am very disappoint +Who turned off the +I understand that things +I'm very +Hi +That's very +Okay, I +Oh no, +Wow, there +What's wrong +I apologize for +Hey, I +Can I help you +Oh, I didn +Alright, +Oh wow, +Oh my goodness +I know this event +What in the +Saying +Yeah, that +Guys, I +Hey, this v +This post +Are +Hey, can +Hello? Is +I can only imagine +Oh, that sounds +Hey, is anyone +I am disappointed +Hello, +Hey everyone, I +That was such +It's okay +The artist +Whoa +I understand that mistakes +Can I help +Who +Hi everyone! I +Hey, can you +Wow, how +Today +Oh no, I +Oh well, I +Well, that +This is the +Yes! I finally +Hey there little +Hello everyone! +Love is never +Look at the +This postcard +Oh great, +Can I +Hmm, this is +I understand your +Oh, look at +B +I'm so +Whoa, this +W +Oh, this +Sometimes +This piece of +What the +That was a +Hey, do +Oh no +Whoa, what +I feel like I +The documentary +Hello +Hello little one +I understand that my +Eww, that +Wow, an +Yes! Finally, +Although the physical location +Whoever is watching +That movie +I remember wondering about +Hey there, little +Who's +Hello, who +Hello everyone! Thank +Hello, can +That's too +Hey, just wanted +Hey there, I +Saying good +Hey there! +Who is there? +Oh my good +I am very +Oh no, what +Wow, thank +I was promised +Hi, is +Hey, I' +Guys, the +Oh no, that +Who is there +Hello, this +That movie really touched +If you have something +The documentary was +I'm starting +Are you kidd +That movie really +Hey everyone, +Thank you for considering +I didn' +Yes! I +Can you +Oh my god +Hey, whoever +That melody really +Thank you, little +Hello, may I +Look +Wow, we +It looks +What do these +Oh wow +I apologize +What are you all +It's such +It's clear +Hey, I was +Hey friend, +I can only +The weather outside is +Eww, this +I miss you +Wow +Aww, +Hi, is there +This artwork +Okay, +Oh well, +This +I' +Say +Hey there little gu +Hmm, +Whoa, who +I am thr +Oh man +Okay, stay calm +I'm happy +Oh, this cur +Oh man, +I'm sorry +Hello? Who +What?! That +This piece +Hey everyone +That's so +Are you okay? +What happened? Where +Hi there +The +Who the hell entered +I can +Guys, +What's +What in +It's important +I'm +I'm coming +It' +Yes! 
Finally +Wait, what +Wow, reading +I'm surprised +Hey, did +Hey, +Okay, let +I understand that you +Who the hell threw +Eww, who +Thank you for thinking +Who is this?\" +I am deeply +Thank you for including +Oh no, an +It looks like you +Aww +I'm confused +Wow, it +That poem really +Yes +Hey there, is +Hey, what' +Thank you for remember +To +This is +Thank you for making +I can' +That mel +Wow, they +I feel like +Although the +Who are you +Love +If +What the hell are +I am so sad +Oh, I found +Thank you +It looks like +Well, life is +I appreciate that +The artist's +Whoa, that +It's never \ No newline at end of file diff --git a/llama.cpp/tools/cvector-generator/cvector-generator.cpp b/llama.cpp/tools/cvector-generator/cvector-generator.cpp new file mode 100644 index 0000000..3ba7c52 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/cvector-generator.cpp @@ -0,0 +1,508 @@ +#include "ggml.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" +#include "llama.h" +#include "pca.hpp" +#include "mean.hpp" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +////////////////////////////////////////////////// +// utils + +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += common_token_to_piece(ctx, *begin); + } + + return ret; +} + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); + printf("\n"); +} + +////////////////////////////////////////////////// + + +// cb_eval is reused for each pair of positive - negative prompt +struct callback_data { + ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered + + int n_layers = 0; + int n_tokens = 0; + bool is_eval_pos = true; + + // each element of the vector correspond to one layer + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. 
NOTE: n_nonzero_rows maybe different for each layer + + // save a tensor into either v_pos or v_neg (decided by is_eval_pos) + void save_tensor_for_layer(struct ggml_tensor * t) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + + if (ctx_ggml == nullptr) { + // alloc a new ctx_ggml if needed + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + } + + // copy tensor data + auto n_bytes = ggml_nbytes(t); + struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); + t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); + ggml_set_name(t_layer, ggml_get_name(t)); + //print_debug_tensor(t_layer); + + if (is_eval_pos) { + v_pos.push_back(t_layer); + } else { + v_neg.push_back(t_layer); + } + } + + // calculate diff (v_pos - v_neg) and place the result back to v_pos + // all zero rows in the diff tensor will also be removed + // NOTE: final layer is ignored. we only have (n_layers - 1) to process + std::vector calc_diff() { + for (float il = 0; il < v_pos.size(); il++) { + float * a = (float *) v_pos[il]->data; + float * b = (float *) v_neg[il]->data; + size_t n_elem = ggml_nelements(v_pos[il]); + for (size_t j = 0; j < n_elem; j++) { + a[j] -= b[j]; + } + //print_debug_tensor(v_pos[i]); + auto diff_filtered = filter_nonzero_rows(v_pos[il]); + v_diff_filtered.push_back(diff_filtered); + } + return v_diff_filtered; // for convinient, we return the result std::vector + } + + // delete zero rows from a given 2D tensor + struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { + //printf("filter_nonzero_rows\n"); + auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { + // check if given row containing all zero elements + int n_cols = t->ne[0]; // hint: should be equal to n_embd + for (int col = 0; col < n_cols; ++col) { + if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) { + return false; + } + } + return true; + }; + std::vector rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered) + for (int i_row = 0; i_row < a->ne[1]; i_row++) { + if (!is_row_all_zeros(a, i_row, 1e-6)) { + rows_to_copy.push_back(i_row); + } + } + + // get "n_nonzero_rows" for the output "diff_filtered" + int n_nonzero_rows = rows_to_copy.size(); + //printf("n_nonzero_rows: %d\n", n_nonzero_rows); + int n_embd = a->ne[0]; + GGML_ASSERT(n_nonzero_rows > 0); + + // diff_filtered: [n_embd, n_nonzero_rows] + struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( + ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); + ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); + diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); + + // copy non-zero rows + for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { + int src_row = rows_to_copy[dest_row]; + for (int i = 0; i < n_embd; i++) { + float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0); + ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem); + } + } + + //print_debug_tensor(diff_filtered); + + return diff_filtered; + } + + // we don't implement destructor, because we want to reuse callback_data. 
we just want to free the tensors + void reset() { + for (auto ptr : v_pos) free(ptr->data); + for (auto ptr : v_neg) free(ptr->data); + for (auto ptr : v_diff_filtered) free(ptr->data); + v_pos.clear(); + v_neg.clear(); + v_diff_filtered.clear(); + if (ctx_ggml) { + ggml_free(ctx_ggml); + } + ctx_ggml = nullptr; + } +}; + +/** + * process_ctx is used to store the ggml context for pre-post processing the diff vectors + * in short, input => v_diff and output => v_final + */ +struct train_context { + ggml_context * ctx_ggml; + int n_embd; + int n_layers; + + /* pair of prompts to be used for generating final vector */ + std::vector positive_entries; + std::vector negative_entries; + + // each element of the vector correspond to one layer + // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here + // NOTE (2): v_diff is transposed from v_diff_tmp + std::vector v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) + std::vector v_final; // vector of vectors of size [n_embd] to be written to file + + // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor + // v_diff_tmp will get converted unto v_diff later on + std::vector> v_diff_tmp; + + train_context(int n_embd_, int n_layers_) { + n_embd = n_embd_; + n_layers = n_layers_; + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + for (int il = 0; il < n_layers - 1; il++) { + std::vector empty; + v_diff_tmp.push_back(empty); + auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); + t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible + v_final.push_back(t); + } + } + + // add new rows into existing tensor in v_diff_tmp + void concat_diff_tmp(const std::vector & diff_filtered) { + GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); + for (int il = 0; il < n_layers - 1; il++) { + auto t = diff_filtered[il]; + auto & diff_tmp = v_diff_tmp[il]; + size_t curr_size = diff_tmp.size(); + diff_tmp.resize(curr_size + ggml_nbytes(t)); + memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); + } + } + + // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) + // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method + void build_v_diff(bool transpose) { + printf("build_v_diff\n"); + for (int il = 0; il < n_layers - 1; il++) { + auto & diff_tmp = v_diff_tmp[il]; + int n_elem = diff_tmp.size() / sizeof(float); + GGML_ASSERT(n_elem % n_embd == 0); + int n_rows = n_elem / n_embd; + struct ggml_tensor * diff = transpose + ? 
ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) + : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); + ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); + diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible + if (transpose) { + // copy data & transpose + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } + } + } else { + // only copy + memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); + } + v_diff.push_back(diff); + print_debug_tensor(diff); + // free memory of diff_tmp + diff_tmp.resize(0); + } + } + + ~train_context() { + for (auto ptr : v_final) free(ptr->data); + for (auto ptr : v_diff) free(ptr->data); + // no need to free v_diff_tmp, since we didn't use malloc + ggml_free(ctx_ggml); + } +}; + +struct tokenized_prompt { + std::vector tokens_pos; + std::vector tokens_neg; + size_t max_seq_len; + + tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + tokens_pos = common_tokenize(ctx, pos, add_bos, true); + tokens_neg = common_tokenize(ctx, neg, add_bos, true); + max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + } + + void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + // TODO: customize padding token + std::vector pad_tokens = common_tokenize(ctx, " ", false); + llama_token pad_tok = pad_tokens.back(); + while (tokens.size() < len) { + tokens.push_back(pad_tok); + } + } +}; + +////////////////////////////////////////////////// + +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + +static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) { + std::vector output; + std::ifstream file(path); + if (!file.is_open()) { + fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); + exit(1); + } + std::string line; + while (std::getline(file, line)) { + bool is_skip = skip_empty_lines && line.empty(); + if (!is_skip) { + string_process_escapes(line); + output.push_back(line); + } + } + file.close(); + return output; +} + +////////////////////////////////////////////////// + +static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + static const char * l_out_name = "l_out"; + const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; + + if (ask) { + return is_l_out; + } + + if (!is_l_out || t->ne[1] != cb_data->n_tokens) { + return true; + } + + // save the tensor to current context + cb_data->save_tensor_for_layer(t); + return true; +} + +static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { + llama_memory_clear(llama_get_memory(ctx), true); + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + return true; +} + +static void export_gguf(const std::vector & v_ctrl, const std::string fname, const std::string model_hint) { + struct gguf_context * ctx = gguf_init_empty(); + + const std::string arch = "controlvector"; + gguf_set_val_str(ctx, 
"general.architecture", arch.c_str()); + gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); + + for (size_t i = 0; i < v_ctrl.size(); ++i) { + gguf_add_tensor(ctx, v_ctrl[i]); + print_debug_tensor(v_ctrl[i]); + printf("Added tensor: %s\n", v_ctrl[i]->name); + } + + printf("%s: writing file...\n", __func__); + gguf_write_to_file(ctx, fname.c_str(), false); + printf("%s: wrote file '%s'\n", __func__, fname.c_str()); + gguf_free(ctx); +} + +/** + * Load prompt files and completion file. + * Then format each pair of prompt + completion to make an entry. + */ +static int prepare_entries(common_params & params, train_context & ctx_train) { + // load prompts + std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); + std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); + if (positive_prompts.size() != negative_prompts.size()) { + fprintf(stderr, "number of positive and negative prompts must be equal\n"); + return 1; + } + if (positive_prompts.empty()) { + fprintf(stderr, "must provide at least one prompt pair\n"); + return 1; + } + ctx_train.positive_entries = positive_prompts; + ctx_train.negative_entries = negative_prompts; + return 0; +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "control_vector.gguf"; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + return 1; + } + + if (params.n_pca_iterations % params.n_pca_batch != 0) { + fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); + return 1; + } + + + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + print_build_info(); + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model to get hparams + auto llama_init = common_init_from_params(params); + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + // int n_ctx = llama_n_ctx(ctx); + int n_layers = llama_model_n_layer(model); + int n_embd = llama_model_n_embd(model); + + // get model hint param (a.k.a model arch name) + char model_hint[128]; + llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + + // init train_context + train_context ctx_train(n_embd, n_layers); + + // load and prepare entries for training + prepare_entries(params, ctx_train); + + // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped + std::vector tokenized_prompts; + size_t n_total_tokens = 0; + for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]); + n_total_tokens += 2 * t.max_seq_len; + tokenized_prompts.push_back(std::move(t)); + } + + std::cout << "n_total_tokens: " << n_total_tokens << std::endl; + + for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + bool success = false; + tokenized_prompt t = tokenized_prompts[i]; + cb_data.n_layers = n_layers; + cb_data.n_tokens = t.max_seq_len; + + printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", + (int) i+1, (int) ctx_train.positive_entries.size(), + tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), + tokens_to_str(ctx, 
t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), + (int) t.max_seq_len); + + cb_data.is_eval_pos = true; + success = get_hidden_layers(ctx, t.tokens_pos); + if (!success) break; + + cb_data.is_eval_pos = false; + success = get_hidden_layers(ctx, t.tokens_neg); + if (!success) break; + + // calculate diff and remove all zero rows + auto v_diff_filtered = cb_data.calc_diff(); + + // save & concat the filtered v_diff to ctx_train + ctx_train.concat_diff_tmp(v_diff_filtered); + + // reset for next iteration + cb_data.reset(); + } + + // done with the model, we can now free it to make gain some memory + printf("Done evaluate prompts, unload model...\n"); + + bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; + + // prepare ctx_train for PCA + ctx_train.build_v_diff(use_pca); + + if (use_pca) { + // run PCA + PCA::pca_params pca_params; + pca_params.n_threads = params.cpuparams.n_threads; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + } else { + // run mean + mean::run(ctx_train.v_diff, ctx_train.v_final); + } + + // write output vectors to gguf + export_gguf(ctx_train.v_final, params.out_file, model_hint); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/cvector-generator/mean.hpp b/llama.cpp/tools/cvector-generator/mean.hpp new file mode 100644 index 0000000..4eeac1e --- /dev/null +++ b/llama.cpp/tools/cvector-generator/mean.hpp @@ -0,0 +1,48 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include + +namespace mean { + +static void run( + const std::vector & v_input, // shape of v_input[0]: [n_embd, n_samples] + const std::vector & v_output) { + printf("%s: Running mean...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%zu", il+1); + + // calculate mean vector + struct ggml_tensor * t_layer = v_input[il]; + GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd + for (int ic = 0; ic < t_layer->ne[0]; ic++) { + float f = 0.0; + for (int ir = 0; ir < t_layer->ne[1]; ir++) { + f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0); + } + f /= t_layer->ne[1]; + ggml_set_f32_1d(ctrl_out, ic, f); + } + + // normalize output vector + float norm = 0.0; + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + norm += f*f; + } + norm = sqrt(norm); + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + ggml_set_f32_1d(ctrl_out, i, f / norm); + } + + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/llama.cpp/tools/cvector-generator/negative.txt b/llama.cpp/tools/cvector-generator/negative.txt new file mode 100644 index 0000000..45b9384 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/negative.txt @@ -0,0 +1,4 @@ +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow 
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me +<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow \ No newline at end of file diff --git a/llama.cpp/tools/cvector-generator/pca.hpp b/llama.cpp/tools/cvector-generator/pca.hpp new file mode 100644 index 0000000..afd3bf6 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/pca.hpp @@ -0,0 +1,315 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include + +#define DEBUG_POS 5 + +static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { + printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]); + if (!with_data) return; + printf("%s: %s[0] = [", __func__, t->name); + for (size_t i = 0; i <= DEBUG_POS; i++) { + printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0)); + } + printf(" ... ]\n"); +} + +namespace PCA { + +// input params for PCA computations +struct pca_params { + int n_threads = 1; + int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used + int n_iterations = 1000; + float tolerance = 1e-7; + + // for debugging + int i_layer = 0; + int n_layers = 0; +}; + +// result from each iteration +struct pca_result { + struct ggml_tensor * calculated_square = NULL; + std::vector eigenvectors; + std::vector distances; +}; + +struct pca_model { + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; // context to compute graph on target device + struct ggml_context * ctx_host; // host context to store results + + // tensors on target device + struct ggml_tensor * dev_input; + struct ggml_tensor * dev_square; + struct ggml_tensor * dev_eigenvector; + + pca_model(struct ggml_tensor * t_input) { +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +// TODO: enable Metal support when support for GGML_OP_SQRT is added +// #ifdef GGML_USE_METAL +// fprintf(stderr, "%s: using Metal backend\n", __func__); +// backend = ggml_backend_metal_init(); +// if (!backend) { +// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); +// } +// #endif + + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + backend = ggml_backend_cpu_init(); + } + + const int num_tensors = 4; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx = ggml_init(params); + + auto n_samples = t_input->ne[0]; + auto n_embd = t_input->ne[1]; + + dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd); + dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + ggml_set_name(dev_input, "dev_input"); + ggml_set_name(dev_square, "dev_square"); + ggml_set_name(dev_eigenvector, 
"dev_eigenvector"); + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input)); + + // initialize eigenvector to random normalized vector + { + std::vector random_vec(ggml_nelements(dev_eigenvector), 0.0); + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + float sum_sqr = 0.0; // for normalizing random_vec + for (size_t i = 0; i < random_vec.size(); ++i) { + float f = distribution(generator); + sum_sqr += f * f; + random_vec[i] = f; + } + // normalize it + float random_vec_norm = std::sqrt(sum_sqr); + for (size_t i = 0; i < random_vec.size(); ++i) { + random_vec[i] /= random_vec_norm; + } + ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector)); + } + } + + ~pca_model() { + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + } +}; + +static struct ggml_cgraph * build_graph_piter( + const struct pca_params & params, + const pca_model & model, + bool calc_square = false) { + GGML_ASSERT(params.n_batch > 0); + // TODO: buf_size must be able to scale with params.n_batch + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // turn v_diff_original into square matrix if needed + struct ggml_tensor * tmp_square; + if (calc_square) { + tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input); + ggml_set_name(tmp_square, "tmp_square"); + } + + struct ggml_tensor * b_tensor; + struct ggml_tensor * distance; + struct ggml_tensor * old_eigen = model.dev_eigenvector; + struct ggml_tensor * input_square = calc_square ? 
tmp_square : model.dev_square; + + for (int i = 0; i < params.n_batch; ++i) { + // b_tensor = square * eigenvector^T + b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen); + ggml_set_name(b_tensor, "b_tensor"); + + // normalize + b_tensor = ggml_div_inplace(ctx0, + b_tensor, + ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) + ); + ggml_format_name(b_tensor, "b_tensor_norm_%d", i); + + // calculate distance(new eigenvector - old eigenvector) + // we don't use ggml_sub because it may not be implemented on GPU backend + struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1)); + distance = ggml_sqrt_inplace(ctx0, + ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); + ggml_format_name(distance, "distance_%d", i); + + old_eigen = b_tensor; + + // build operations nodes + ggml_build_forward_expand(gf, distance); + } + + // delete the temporally context used to build the graph + ggml_free(ctx0); + return gf; +} + +static ggml_status compute_piter( + const struct pca_params & params, + const pca_model & model, + struct ggml_cgraph * gf, + ggml_gallocr_t allocr, + struct pca_result & result) { + // allocate tensors + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); + } + + ggml_status res = ggml_backend_graph_compute(model.backend, gf); + if (res == GGML_STATUS_SUCCESS) { + auto extract_i = [](std::string prefix, std::string str) -> int { + int i = -1; + if (str.rfind(prefix, 0) == 0) { + sscanf(str.c_str(), (prefix + "%d").c_str(), &i); + } + return i; + }; + result.calculated_square = NULL; + result.eigenvectors.clear(); + result.distances.clear(); + result.eigenvectors.resize(params.n_batch); + result.distances.resize(params.n_batch); + // get output nodes + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); + int iter = -1; + // find b_tensor (without copying data from device) + if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { + result.eigenvectors[iter] = node; + } + // find distances, then copy data from device + if ((iter = extract_i("distance_", node->name)) > -1) { + float d; + ggml_backend_tensor_get(node, &d, 0, sizeof(float)); + result.distances[iter] = d; + // std::cout << node->name << " = " << d << "\n"; + } + // find tmp_square if it exists (without copying data from device) + if (std::string(node->name) == "tmp_square") { + result.calculated_square = node; + } + } + } + return res; +} + +static void power_iteration( + const struct pca_params & params, + struct ggml_tensor * input, // shape of input: [n_samples, n_embd] + struct ggml_tensor * output) { + //printf("in power iteration\n"); + struct pca_model model(input); + + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + struct pca_result result; + struct ggml_tensor * last_eigenvector = NULL; + + int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations + for (int iter = 0; iter < n_iters; ++iter) { + bool calc_square = (iter == 0); // only need to calculate square for first iteration + struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); + // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); + compute_piter(params, model, gf, allocr, result); + + for (size_t k = 0; k < result.distances.size(); ++k) { + last_eigenvector = result.eigenvectors[k]; + if (result.distances[k] < params.tolerance) { + break; // done + } + } 
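+            // note: the graph above performed params.n_batch matrix-vector products in one go;
+            // result.distances[k] is the L2 distance between consecutive eigenvector estimates,
+            // so the scan keeps the first estimate whose change already dropped below the
+            // tolerance (or the last estimate of the batch if none has converged yet)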
+ + if (calc_square) { + // copy and store the square matrix if needed + GGML_ASSERT(result.calculated_square != NULL); + ggml_backend_tensor_copy(result.calculated_square, model.dev_square); + } + + { + // copy last eigen vector and store as input for next iteration + GGML_ASSERT(last_eigenvector != NULL); + ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector); + } + + printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", + __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch); + } + + // get output tensor + GGML_ASSERT(last_eigenvector); + ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); + //print_debug_tensor(output); + ggml_gallocr_free(allocr); + + // TODO @ngxson : The output vector is randomly inverted + // Solution: https://github.com/ggml-org/llama.cpp/pull/8069#issuecomment-2185328171 +} + +static void run_pca( + struct pca_params & params, + const std::vector & v_input, // shape of v_input[0]: [n_samples, n_embd] + const std::vector & v_output) { + printf("%s: Running PCA...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%zu", il+1); + + // run power_iteration + params.i_layer = il; + params.n_layers = v_input.size(); + power_iteration(params, v_input[il], ctrl_out); + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/llama.cpp/tools/cvector-generator/positive.txt b/llama.cpp/tools/cvector-generator/positive.txt new file mode 100644 index 0000000..fea7362 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/positive.txt @@ -0,0 +1,4 @@ +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever! +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you +<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now! \ No newline at end of file diff --git a/llama.cpp/tools/export-lora/CMakeLists.txt b/llama.cpp/tools/export-lora/CMakeLists.txt new file mode 100644 index 0000000..cddfa77 --- /dev/null +++ b/llama.cpp/tools/export-lora/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-export-lora) +add_executable(${TARGET} export-lora.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/export-lora/README.md b/llama.cpp/tools/export-lora/README.md new file mode 100644 index 0000000..7dce99c --- /dev/null +++ b/llama.cpp/tools/export-lora/README.md @@ -0,0 +1,33 @@ +# export-lora + +Apply LORA adapters to base model and export the resulting model. 
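For each base tensor that has matching `lora_a`/`lora_b` tensors in the adapter(s), the merge performed by `merge_tensor()` in `export-lora.cpp` is, roughly, $W_{merged} = W_{base} + s \cdot (B \cdot A)$ with $s = \text{scale} \cdot \alpha / \text{rank}$, where $\alpha$ is read from `adapter.lora.alpha` (if it is zero, $s$ falls back to the user-supplied scale alone). Tensors without LoRA counterparts are copied unchanged, and merged tensors are written as F16 (F32 tensors stay F32).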
+ +``` +usage: llama-export-lora [options] + +options: + -m, --model model path from which to load base model (default '') + --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) + --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) + -t, --threads N number of threads to use during computation (default: 4) + -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') +``` + +For example: + +```bash +./bin/llama-export-lora \ + -m open-llama-3b-v2.gguf \ + -o open-llama-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf +``` + +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: + +```bash +./bin/llama-export-lora \ + -m your_base_model.gguf \ + -o your_merged_model.gguf \ + --lora-scaled lora_task_A.gguf 0.5 \ + --lora-scaled lora_task_B.gguf 0.5 +``` diff --git a/llama.cpp/tools/export-lora/export-lora.cpp b/llama.cpp/tools/export-lora/export-lora.cpp new file mode 100644 index 0000000..41f4262 --- /dev/null +++ b/llama.cpp/tools/export-lora/export-lora.cpp @@ -0,0 +1,434 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" + +#include +#include +#include +#include + +static bool g_verbose = false; + +struct tensor_transformation { + struct ggml_tensor * in; + struct ggml_tensor * out; + bool is_copy; +}; + +static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); +} + +static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load input GGUF from " + fname); + } + return ctx_gguf; +} + +struct file_input { + struct ggml_context * ctx_meta = nullptr; + struct gguf_context * ctx_gguf = nullptr; + std::ifstream f_in; + std::map tensors; + float alpha; + float scale; + + file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + if (!f_in.is_open()) { + throw std::runtime_error("failed to open input gguf from " + fname); + } + + ctx_gguf = load_gguf(fname, &ctx_meta); + alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); + printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) { + std::string name(cur->name); + tensors[name] = cur; + if (g_verbose) { + printf("%s: %s\n", __func__, cur->name); + } + } + } + + ggml_tensor * get_tensor(std::string name) { + if (tensors.find(name) == tensors.end()) { + return nullptr; + } + return tensors[name]; + } + + void read_tensor_data(std::string name, std::vector & buf) { + if (tensors.find(name) == tensors.end()) { + throw std::runtime_error("cannot find tensor with name: " + name); + } + auto len = ggml_nbytes(tensors[name]); + if (buf.size() < len) { + buf.resize(len); + } + auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + f_in.seekg(offset); + f_in.read((char* )buf.data(), len); + } + + ~file_input() { + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } +}; + +struct lora_merge_ctx { + // input base model + adapters + file_input base_model; + std::vector> adapters; + + // for computing merged tensor + int n_threads; + ggml_backend_t backend = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector read_buf; + + // output file + struct gguf_context * ctx_out; + struct ggml_context * ctx_out_ggml; + std::ofstream fout; + + lora_merge_ctx( + std::string & base_fname, + std::vector & lora_files, + std::string & outfile, + int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { + throw std::runtime_error("split model is not yet supported"); + } + + for (auto & lora_inp : lora_files) { + auto fname = lora_inp.path; + auto scale = lora_inp.scale; + std::unique_ptr adapter(new file_input(fname, scale)); + check_metadata_lora(adapter.get()); + adapters.push_back(std::move(adapter)); + } + + ctx_out = gguf_init_empty(); + struct ggml_init_params params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + 
/*.no_alloc =*/ true, + }; + ctx_out_ggml = ggml_init(params); + backend = ggml_backend_cpu_init(); + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + + void check_metadata_lora(file_input * adapter) { + auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); + auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); + if (general_arch_base != general_arch_lora) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + } + + ggml_type get_out_tensor_type(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + return GGML_TYPE_F32; + } else { + return GGML_TYPE_F16; + } + } + + void run_merge() { + // prepare metadata + gguf_set_kv(ctx_out, base_model.ctx_gguf); + // output is forced to f16 for now + gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); + + // check if all lora adapters have the same tensors + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggml-org/llama.cpp/pull/8607#discussion_r1686027777 + static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once."; + if (adapters.size() > 1) { + for (size_t i = 1; i < adapters.size(); ++i) { + if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { + throw std::runtime_error(err_no_subset_adapter); + } + for (auto & it : adapters[i]->tensors) { + if (adapters[0]->get_tensor(it.first) == nullptr) { + throw std::runtime_error(err_no_subset_adapter); + } + } + } + } + + // mapping base tensor to out tensor (same shape with base, but different type) + std::vector trans; + for (auto & it : base_model.tensors) { + bool t_a = true; + bool t_b = true; + for (auto & adapter : adapters) { + t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); + t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); + } + auto base_tensor = it.second; + if (!t_a && !t_b) { + // only copy + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + trans.push_back({ + cpy_tensor, + cpy_tensor, + true, + }); + gguf_add_tensor(ctx_out, cpy_tensor); + } else if (t_a && t_b) { + // need merging + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); + ggml_set_name(out_tensor, base_tensor->name); + trans.push_back({ + base_tensor, + out_tensor, + false, + }); + gguf_add_tensor(ctx_out, out_tensor); + } else { + throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); + } + } + + // placeholder for the meta data + { + size_t meta_size = gguf_get_meta_size(ctx_out); + zeros(fout, meta_size); + } + + // process base model tensors + size_t n_merged = 0; + for (auto & it : trans) { + if (!it.is_copy) { + merge_tensor(it.in, it.out); + n_merged++; + } else { + copy_tensor(it.in); + } + } + + // write output metadata + { + std::vector data(gguf_get_meta_size(ctx_out)); + 
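+            // serialize the final GGUF header into the buffer and overwrite the zero
+            // placeholder that was written at the start of the output file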
gguf_get_meta_data(ctx_out, data.data()); + fout.seekp(0); + fout.write((const char *)data.data(), data.size()); + } + + printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); + } + + void copy_tensor(struct ggml_tensor * base) { + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + size_t len = ggml_nbytes(base); + base_model.read_tensor_data(base->name, read_buf); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { + std::string name_base(base->name); + std::string name_lora_a = name_base + ".lora_a"; + std::string name_lora_b = name_base + ".lora_b"; + + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + + // context for input tensor + std::vector inp_a(adapters.size()); + std::vector inp_b(adapters.size()); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + // alloc tensors + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); + for (size_t i = 0; i < adapters.size(); ++i) { + auto t_a = adapters[i]->get_tensor(name_lora_a); + auto t_b = adapters[i]->get_tensor(name_lora_b); + // TODO: add support for quantized lora + if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) { + throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32"); + } + inp_a[i] = ggml_dup_tensor(ctx, t_a); + inp_b[i] = ggml_dup_tensor(ctx, t_b); + } + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // load base tensor to backend buffer + base_model.read_tensor_data(name_base, read_buf); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + const auto * qtype = ggml_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer + for (size_t i = 0; i < adapters.size(); ++i) { + adapters[i]->read_tensor_data(name_lora_a, read_buf); + ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); + adapters[i]->read_tensor_data(name_lora_b, read_buf); + ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); + } + + // build graph + struct ggml_cgraph * gf; + { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(params0); + gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur = inp_base; + for (size_t i = 0; i < adapters.size(); ++i) { + struct ggml_tensor * delta; + bool is_tok_embd = string_starts_with(name_base, "token_embd"); + if (is_tok_embd) { + printf("%s : detected token embeddings tensor\n", __func__); + delta = 
ggml_mul_mat(ctx0, + ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32), + ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)); + } else { + delta = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))), + ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); + } + // scale + const float alpha = adapters[i]->alpha; + const float rank = (float) inp_b[i]->ne[0]; + const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; + delta = ggml_scale(ctx0, delta, scale); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); + } + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + } + + // compute + { + ggml_gallocr_alloc_graph(allocr, gf); + ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_graph_compute(backend, gf); + } + + // write data to output file + { + auto * result = ggml_graph_node(gf, -1); + size_t len = ggml_nbytes(result); + if (read_buf.size() < len) { + read_buf.resize(len); + } + ggml_backend_tensor_get(result, read_buf.data(), 0, len); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + } + + ~lora_merge_ctx() { + ggml_gallocr_free(allocr); + ggml_backend_free(backend); + gguf_free(ctx_out); + ggml_free(ctx_out_ggml); + } +}; + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); + printf("\nNOTE: output model is F16\n"); + printf("\n"); +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "ggml-lora-merged-f16.gguf"; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + return 1; + } + + g_verbose = (params.verbosity > 1); + try { + lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads); + ctx.run_merge(); + } catch (const std::exception & err) { + fprintf(stderr, "%s\n", err.what()); + exit(EXIT_FAILURE); + } + + printf("done, output file is %s\n", params.out_file.c_str()); + + return 0; +} diff --git a/llama.cpp/tools/fit-params/CMakeLists.txt b/llama.cpp/tools/fit-params/CMakeLists.txt new file mode 100644 index 0000000..34c3373 --- /dev/null +++ b/llama.cpp/tools/fit-params/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-fit-params) +add_executable(${TARGET} fit-params.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/fit-params/README.md b/llama.cpp/tools/fit-params/README.md new file mode 100644 index 0000000..8f0c958 --- /dev/null +++ b/llama.cpp/tools/fit-params/README.md @@ -0,0 +1,55 @@ +# fit-params + +llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime, +this is controlled using the CLI arguments starting with `-fit`/`--fit`. +Internally the code is calling `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs. 
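For programs that link against llama.cpp directly, the same adjustment can be requested before creating the model and context. Below is a minimal sketch that mirrors the `llama_params_fit()` call made by `fit-params.cpp` further down; the model path is an example, and passing null/zero for the optional tensor-split, buffer-override, target and minimum-context arguments is an assumption here, not a documented calling convention.

```cpp
// Sketch only: fit mparams/cparams to the currently free device memory.
// Argument order follows the call in fit-params.cpp; the null/zero values are assumptions.
llama_model_params   mparams = llama_model_default_params();
llama_context_params cparams = llama_context_default_params();

const llama_params_fit_status status = llama_params_fit(
    "/opt/models/qwen_3-30b3a-f16.gguf",   // model path (example)
    &mparams, &cparams,
    /*tensor_split          =*/ nullptr,
    /*tensor_buft_overrides =*/ nullptr,
    /*target                =*/ nullptr,
    /*min_ctx               =*/ 0,
    GGML_LOG_LEVEL_ERROR);

if (status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
    // mparams.n_gpu_layers, mparams.tensor_split and cparams.n_ctx now hold the fitted values
}
```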
+`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout. +Example usage: + +``` bash +# First, run llama-fit-params and store the results in a file: +> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes +build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu +llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory +llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB +llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total +llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB +llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory: +llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free +llama_params_fit: successfully fit params to free device memory +llama_params_fit: fitting params to free memory took 1.15 seconds +Printing fitted CLI arguments to stdout... +-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU + +# Next, use those results for a llama.cpp binary: +> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes +build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu +system info: n_threads = 16, n_threads_batch = 16, total_threads = 32 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | 
AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +main: binding port with default address family +main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31 +main: loading model +srv load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf' +llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory +llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed +llama_params_fit: successfully fit params to free device memory +llama_params_fit: fitting params to free memory took 0.28 seconds +[...] +main: server is listening on http://127.0.0.1:8080 - starting the main loop +srv update_slots: all slots are idle +^Csrv operator(): operator(): cleaning up before exit... + +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - CUDA0 (RTX 4090) | 24077 = 945 + (19187 = 17904 + 384 + 898) + 3945 | +llama_memory_breakdown_print: | - Host | 58271 = 58259 + 0 + 12 | +``` diff --git a/llama.cpp/tools/fit-params/fit-params.cpp b/llama.cpp/tools/fit-params/fit-params.cpp new file mode 100644 index 0000000..0176be0 --- /dev/null +++ b/llama.cpp/tools/fit-params/fit-params.cpp @@ -0,0 +1,66 @@ +#include "llama.h" + +#include "arg.h" +#include "common.h" +#include "log.h" + +#include +#include +#include + +using namespace std::chrono_literals; + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + return 1; + } + + common_init(); + llama_backend_init(); + llama_numa_init(params.numa); + auto mparams = common_model_params_to_llama(params); + auto cparams = common_context_params_to_llama(params); + const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams, + params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, + params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) { + LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); + exit(1); + } + + LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__); + common_log_flush(common_log_main()); + printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers); + + size_t nd = llama_max_devices(); + while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) { + nd--; + } + if (nd > 1) { + for (size_t id = 0; id < nd; id++) { + if (id == 0) { + printf(" -ts "); + } + printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id])); + } + } + + const size_t ntbo = llama_max_tensor_buft_overrides(); + bool any_tbo = false; + for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) { + if (itbo == 0) { + printf(" -ot \""); + } + printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft)); + any_tbo = true; + } + printf("%s\n", any_tbo ? 
"\"" : ""); + + return 0; +} diff --git a/llama.cpp/tools/gguf-split/CMakeLists.txt b/llama.cpp/tools/gguf-split/CMakeLists.txt new file mode 100644 index 0000000..9b21250 --- /dev/null +++ b/llama.cpp/tools/gguf-split/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-gguf-split) +add_executable(${TARGET} gguf-split.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/gguf-split/README.md b/llama.cpp/tools/gguf-split/README.md new file mode 100644 index 0000000..ad1d866 --- /dev/null +++ b/llama.cpp/tools/gguf-split/README.md @@ -0,0 +1,10 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`. +- `--split-max-tensors`: maximum tensors in each split: default(128) +- `--merge`: merge multiple GGUF to a single GGUF. diff --git a/llama.cpp/tools/gguf-split/gguf-split.cpp b/llama.cpp/tools/gguf-split/gguf-split.cpp new file mode 100644 index 0000000..30e7715 --- /dev/null +++ b/llama.cpp/tools/gguf-split/gguf-split.cpp @@ -0,0 +1,583 @@ +#include "ggml.h" +#include "gguf.h" +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +enum split_operation : uint8_t { + OP_NONE, + OP_SPLIT, + OP_MERGE, +}; + +enum split_mode : uint8_t { + MODE_NONE, + MODE_TENSOR, + MODE_SIZE, +}; + +struct split_params { + split_operation operation = OP_NONE; + split_mode mode = MODE_NONE; + size_t n_bytes_split = 0; + int n_split_tensors = 128; + std::string input; + std::string output; + bool no_tensor_first_split = false; + bool dry_run = false; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (enabled by default)\n"); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); + printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); + printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); + printf("\n"); +} + +// return convert string, for example "128M" or "4G" to number of bytes +static size_t split_str_to_n_bytes(std::string str) { + size_t n_bytes = 0; + int n; + if (str.back() == 'M') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1000 * 1000; // megabytes + } else if (str.back() == 'G') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes + } else { + throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); + } + if (n <= 0) { + throw std::invalid_argument("error: size must be a positive value"); 
+ } + return n_bytes; +} + +static void split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + if (arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } else if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } else if (arg == "--dry-run") { + arg_found = true; + params.dry_run = true; + } else if (arg == "--no-tensor-first-split") { + arg_found = true; + params.no_tensor_first_split = true; + } else if (arg == "--merge") { + arg_found = true; + if (params.operation != OP_NONE && params.operation != OP_MERGE) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_MERGE; + } else if (arg == "--split") { + arg_found = true; + if (params.operation != OP_NONE && params.operation != OP_SPLIT) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_SPLIT; + } else if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_TENSOR; + params.n_split_tensors = atoi(argv[arg_idx]); + } else if (arg == "--split-max-size") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + if (params.mode != MODE_NONE && params.mode != MODE_SIZE) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_SIZE; + params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + // the operation is split if not specified + if (params.operation == OP_NONE) { + params.operation = OP_SPLIT; + } + // the split mode is by tensor if not specified + if (params.mode == MODE_NONE) { + params.mode = MODE_TENSOR; + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx != 2) { + throw std::invalid_argument("error: bad arguments"); + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + split_params_parse_ex(argc, argv, params); + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; + + // one ctx_out per one output file + std::vector 
ctx_outs; + + // temporary buffer for reading in tensor data + std::vector read_buf; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)) { + + // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits + int i_split = -1; + struct gguf_context * ctx_out = NULL; + auto new_ctx_out = [&](bool allow_no_tensors) { + i_split++; + if (ctx_out != NULL) { + if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { + fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); + exit(EXIT_FAILURE); + } + ctx_outs.push_back(ctx_out); + } + ctx_out = gguf_init_empty(); + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); + }; + + // initialize ctx_out for the first split + new_ctx_out(false); + + // skip first split if no_tensor_first_split is set + if (params.no_tensor_first_split) { + new_ctx_out(true); + } + + // process tensors one by one + size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) + for (int i = 0; i < n_tensors; ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + // calculate the "imaginary" size = the current size + next tensor size + size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); + size_t next_tensors_size = curr_tensors_size + n_bytes; + if (should_split(i, next_tensors_size)) { + new_ctx_out(false); + curr_tensors_size = n_bytes; + } else { + curr_tensors_size = next_tensors_size; + } + gguf_add_tensor(ctx_out, t); + } + + // push the last ctx_out + ctx_outs.push_back(ctx_out); + + // set the correct n_split for all ctx_out + for (auto & ctx : ctx_outs) { + gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); + } + } + + ~split_strategy() { + for (auto & ctx_out : ctx_outs) { + gguf_free(ctx_out); + } + } + + bool should_split(int i_tensor, size_t next_size) { + if (params.mode == MODE_SIZE) { + // split by max size per file + return next_size > params.n_bytes_split; + } else if (params.mode == MODE_TENSOR) { + // split by number of tensors per file + return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + // should never happen + GGML_ABORT("invalid mode"); + } + + void print_info() { + printf("n_split: %zu\n", ctx_outs.size()); + int i_split = 0; + for (auto & ctx_out : ctx_outs) { + // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) + size_t total_size = gguf_get_meta_size(ctx_out); + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); + total_size += ggml_nbytes(t); + } + total_size = total_size / 1000 / 1000; // convert to megabytes + printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + i_split++; + } + } + + void write() { + int i_split = 0; + int n_split = ctx_outs.size(); + for (auto & ctx_out : ctx_outs) { + // construct file path + char split_path[PATH_MAX] = {0}; + 
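+            // llama_split_path() builds the shard name from the output prefix, a zero-padded
+            // split index and the total split count (e.g. ggml-model-split-00001-of-00012.gguf,
+            // as exercised by tests.sh below)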
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + + // open the output file + printf("Writing file %s ... ", split_path); + fflush(stdout); + std::ofstream fout = std::ofstream(split_path, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + // write metadata + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + // write tensors + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + // read tensor meta and prepare buffer + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + + // copy tensor from input to output file + copy_file_to_file(f_input, fout, offset, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + printf("done\n"); + // close the file + fout.close(); + i_split++; + } + } + + void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); + f_out.write((const char *)read_buf.data(), len); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + // prepare the strategy + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + int n_split = strategy.ctx_outs.size(); + strategy.print_info(); + + if (!split_params.dry_run) { + // write all output splits + strategy.write(); + } + + // done, clean up + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + + + auto * ctx_out = gguf_init_empty(); + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * 
ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + if (i_split > 0) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); + + auto * ctx_gguf = gguf_init_from_file(split_path, params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_SPLIT_COUNT); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + std::ofstream fout; + if (!split_params.dry_run) { + fout.open(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + // placeholder for the meta data + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); + } + gguf_free(ctx_out); + if (!split_params.dry_run) { + fout.close(); + } + exit(EXIT_FAILURE); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); + + auto * ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + 
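+            // copy the raw tensor bytes from this split at the offset recorded in its header,
+            // then (unless this is a dry run) append them to the merged file with alignment padding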
f_input.read((char *)read_data.data(), n_bytes); + if (!split_params.dry_run) { + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + if (!split_params.dry_run) { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + fout.close(); + } + gguf_free(ctx_out); + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case OP_SPLIT: gguf_split(params); + break; + case OP_MERGE: gguf_merge(params); + break; + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/llama.cpp/tools/gguf-split/tests.sh b/llama.cpp/tools/gguf-split/tests.sh new file mode 100755 index 0000000..c8dd0b0 --- /dev/null +++ b/llama.cpp/tools/gguf-split/tests.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +set -eu + +if [ $# -lt 1 ] +then + echo "usage: $0 path_to_build_binary [path_to_temp_folder]" + echo "example: $0 ../../build/bin ../../tmp" + exit 1 +fi + +if [ $# -gt 1 ] +then + TMP_DIR=$2 +else + TMP_DIR=/tmp +fi + +set -x + +SPLIT=$1/llama-gguf-split +MAIN=$1/llama-completion +WORK_PATH=$TMP_DIR/gguf-split +ROOT_DIR=$(realpath $(dirname $0)/../../) + +mkdir -p "$WORK_PATH" + +# Clean up in case of previously failed test +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf + +# 1. Get a model +( +cd $WORK_PATH +"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/Qwen3-0.6B-GGUF --file Qwen3-0.6B-Q8_0.gguf +) +echo PASS + +# 2. Split with max tensors strategy +$SPLIT --split-max-tensors 28 $WORK_PATH/Qwen3-0.6B-Q8_0.gguf $WORK_PATH/ggml-model-split +echo PASS +echo + +# 2b. Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00012.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 3. Merge +$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00012.gguf $WORK_PATH/ggml-model-merge.gguf +echo PASS +echo + +# 3b. Test the merged model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 4. Split with no tensors in the first split +$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +echo PASS +echo + +# 4b. Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00011.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 5. Merge +#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00012.gguf $WORK_PATH/ggml-model-merge-2.gguf +#echo PASS +#echo + +# 5b. Test the merged model is loading properly +#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 +#echo PASS +#echo + +# 6. Split with size strategy +$SPLIT --split-max-size 500M $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-500M +echo PASS +echo + +# 6b. 
Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-500M-00001-of-00002.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# Clean up +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf diff --git a/llama.cpp/tools/imatrix/CMakeLists.txt b/llama.cpp/tools/imatrix/CMakeLists.txt new file mode 100644 index 0000000..5af6263 --- /dev/null +++ b/llama.cpp/tools/imatrix/CMakeLists.txt @@ -0,0 +1,13 @@ +set(TARGET llama-imatrix) +add_executable(${TARGET} imatrix.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() + +if (CMAKE_SYSTEM_NAME MATCHES "AIX") + # AIX's flock() function comes from libbsd.a + target_link_libraries(${TARGET} PRIVATE -lbsd) +endif() diff --git a/llama.cpp/tools/imatrix/README.md b/llama.cpp/tools/imatrix/README.md new file mode 100644 index 0000000..4505cb4 --- /dev/null +++ b/llama.cpp/tools/imatrix/README.md @@ -0,0 +1,98 @@ +# llama.cpp/tools/imatrix + +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models. +More information is available in . + +## Usage + +``` +./llama-imatrix \ + -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \ + [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \ + [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \ + [--show-statistics] [...] +``` + +Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as e.g. `wiki.train.raw`) are mandatory. +The parameters in square brackets are optional and have the following meaning: + +* `-h | --help` shows usage information and exits. +* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. +* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used. +* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf". +* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) +* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. +* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets. +* `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers. +* `--chunk | --from-chunk` to skip the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data. +* `--chunks` maximum number of chunks to process. Default is -1 for all available chunks. 
+* `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up the processing and do not care about perplexity. +* `--show-statistics` displays imatrix file's statistics. + +For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument. + +Recent versions of `llama-imatrix` store data in GGUF format by default. For the legacy format, use an extension other than `.gguf` when saving the output file. More information is available in . + +## Examples + +```bash +# generate importance matrix using default filename (imatrix.gguf), offloading 99 layers to GPU +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -ngl 99 + +# use the imatrix to perform a Q4_K_M quantization +./llama-quantize --imatrix imatrix.gguf ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +``` + +```bash +# generate and save the imatrix using legacy format +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99 +``` + +```bash +# convert legacy (binary) imatrix format to new (GGUF) format +./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf +``` + +```bash +# convert new (GGUF) imatrix format to legacy (binary) format +./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat +``` + +```bash +# combine existing imatrices +./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf +``` + +```bash +# skip first 5 chunks, save intermediates every 20 chunks and snapshots every 50, parsing special tokens +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --chunk 5 --output-frequency 20 --save-frequency 50 --parse-special +``` + +```bash +# analyse imatrix file and display summary statistics instead of running inference +./llama-imatrix --in-file imatrix.gguf --show-statistics +``` + +`--show-statistics` will display the following statistics: + +#### Per tensor + +* Σ(Act²): sum of all squared activations (the importance scores) +* Min & Max: minimum and maximum squared activations values +* μ & σ: Squared activations' mean and standard deviation +* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference +* N: number of squared activations +* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy measurement) $S = -\sum_{i=1}^N p_i \log_2 p_i$ +* E (norm): Normalized entropy. $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{log_2 N}$. These two metrics can be used to determine how well a prompt "exercises" the model's capabilities +* ZD Score: z-score distribution as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415) +* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations. + +#### Per layer + +Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated. + +#### Important note on the computed Statistics + +When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**. 
+Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. diff --git a/llama.cpp/tools/imatrix/imatrix.cpp b/llama.cpp/tools/imatrix/imatrix.cpp new file mode 100644 index 0000000..669de55 --- /dev/null +++ b/llama.cpp/tools/imatrix/imatrix.cpp @@ -0,0 +1,1302 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" +#include "gguf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n" + " [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n" + " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n" + " [--show-statistics] [...]\n" , argv[0]); + LOG("\n"); +} + +static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; +static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; +static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; + +struct Stats { + std::vector values; + std::vector counts; +}; + +struct tensor_statistics { + std::string tensor; + Stats stats; + float total_sqract = 0.0f; + float mean_sqract = 0.0f; + float max_sqract = 0.0f; + float min_sqract = 0.0f; + int elements = 0; + float stddev = 0.0f; + float active = 0.0f; + float entropy = 0.0f; + float zd = 0.0f; + float cossim = 0.0f; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_params(common_params params) { m_params = std::move(params); } + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); + void save_imatrix_legacy(int32_t ncall = -1) const; + void save_imatrix(int32_t n_chunk = -1) const; + bool load_imatrix_legacy(const char * fname); + bool load_imatrix(const char * file_name); + const std::unordered_map & get_mstats() const { return m_stats; } +private: + std::unordered_map m_stats; + common_params m_params; + std::mutex m_mutex; + std::vector m_datasets; + int32_t m_last_chunk = 0; + std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id +}; + +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char * name) { + std::string wname; + const char * p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) { + std::vector name; + std::istringstream stream(input); + std::string item; + + while (std::getline(stream, item, '.')) { + name.push_back(item); + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "blk" && i + 1 < name.size()) { + layer = name[i + 1]; + break; + } + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "weight" && i > 0) { + tensor = name[i - 1]; + break; + } + } + + if (tensor.empty()) { + tensor = input; + } + if (layer.empty()) { + layer = "-"; + } +} + +static void 
compute_statistics(std::vector & tstats, const std::string & name, const Stats & e) { + if (e.values.size() % e.counts.size() != 0) { + LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size()); + return; + } + if (e.counts.empty()) { + LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str()); + return; + } + + const int n_mat = e.counts.size(); + const int row_size = e.values.size() / n_mat; + + std::vector activations; + activations.reserve(e.values.size()); + + for (int i = 0; i < n_mat; ++i) { + for (int j = 0; j < row_size; ++j) { + activations.push_back(e.values[i*row_size + j] / e.counts[i]); + } + } + + const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f); + const float act_max = *std::max_element(activations.begin(), activations.end()); + const float act_min = *std::min_element(activations.begin(), activations.end()); + const float act_mean = act_total / activations.size(); + const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + const float act_var = (act_sqr_total / activations.size()) - (act_mean * act_mean); + const float act_dev = std::sqrt(std::max(0.0f, act_var)); + float threshold = 1e-5f; + const int inactive_count = std::count_if(activations.begin(), activations.end(), + [threshold](const float v) { return fabsf(v) <= threshold; }); + const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); + + float entropy = 0; + if (act_total > 0) { + for (const auto act : activations) { + if (const float p = act / act_total; p > 0) { + entropy -= p * std::log2(p); + } + } + } + + int z_score = 0; + if (act_dev > 0.0f) { + for (const auto act : activations) { + if (const float p = (act - act_mean) / act_dev; p > 1) { + z_score++; + } + } + } + + auto & ts = tstats.emplace_back(); + ts.tensor = name; + ts.stats = e; + ts.total_sqract = act_total; + ts.mean_sqract = act_mean; + ts.max_sqract = act_max; + ts.min_sqract = act_min; + ts.elements = static_cast(activations.size()); + ts.stddev = act_dev; + ts.active = active_ratio; + ts.entropy = entropy; + ts.zd = static_cast(z_score) / ts.elements; +} + +static void compute_cossim(std::vector & tstats) { + static const std::regex pattern(R"(blk\.(\d+)\.)"); + for (auto & ts : tstats) { + if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) { + const int blk = std::stoi(match[1]); + std::string tname(ts.tensor); + tname.replace(match.position(1), match.length(1), std::to_string(blk-1)); + auto prev = std::find_if(tstats.begin(), tstats.end(), + [tname](const tensor_statistics & t) { return t.tensor == tname; }); + if (prev != tstats.end()) { + const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + prev->stats.values.begin(), 0.0f); + const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + ts.stats.values.begin(), 0.0f)); + const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), + prev->stats.values.begin(), 0.0f)); + const float cs = dp / (curr_mag * prev_mag); + ts.cossim = cs; + } + } else { + ts.cossim = 0; + } + } +} + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + std::string wname = 
filter_tensor_name(src0->name); + + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) return false; + // why are small batches ignored (<16 tokens)? + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; + return true; + } + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + const size_t src1_nbytes = ggml_nbytes(src1); + m_src1_data.resize(src1_nbytes); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes); + } + + const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 + if (t->op == GGML_OP_MUL_MAT_ID) { + // ids -> [n_experts_used, n_tokens] + // src1 -> [cols, n_expert_used, n_tokens] + const ggml_tensor * ids = t->src[2]; + const int64_t n_as = src0->ne[2]; + const int64_t n_ids = ids->ne[0]; + + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! + + GGML_ASSERT(ids->ne[1] == src1->ne[2]); + + // the extra dimension would need to be stored somewhere to be reflected in the imatrix file + if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { + LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); + GGML_ASSERT(false); + } + + m_ids.resize(ggml_nbytes(ids)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto & e = m_stats[wname]; + + if (e.counts.size() == 1 && n_as > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(n_as, e.counts[0]); + } + if (e.values.empty()) { + e.values.resize(src1->ne[0]*n_as, 0); + e.counts.resize(n_as, 0); + } + else if (e.values.size() != (size_t)src1->ne[0]*n_as) { + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as)); + exit(1); //GGML_ABORT("fatal error"); + } + else if (e.counts.size() != (size_t)n_as) { + LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as); + exit(1); //GGML_ABORT("fatal error"); + } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + // loop over all possible experts, regardless if they are used or not in the batch + for (int64_t ex = 0; ex < n_as; ++ex) { + size_t e_start = ex*src1->ne[0]; + + for (int64_t idx = 0; idx < n_ids; ++idx) { + for (int64_t row = 0; row < src1->ne[2]; ++row) { + const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); + + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + + if (excur != ex) continue; + + const int64_t i11 = idx % src1->ne[1]; + const int64_t i12 = row; + const float * x = 
(const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]); + + e.counts[ex]++; + + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[e_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[e_start + j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str()); + exit(1); + } + } + } + } + const int32_t n_chunk = e.counts[ex] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } + } + } + } else { + auto & e = m_stats[wname]; + const int64_t n_mat = src0->ne[2] * src0->ne[3]; + + // use a single count per dense tensor + // (necessary when merging older GGUF-imatrix files with 3d tensors) + if (e.counts.size() > 1) { + bool all_equal = true; + for (size_t i = 1; i < e.counts.size(); ++i) { + if (e.counts[0] != e.counts[i]) { + all_equal = false; + break; + } + } + if (all_equal) { + e.counts.resize(1); + } + } + if (e.values.empty()) { + e.values.resize(src1->ne[0] * n_mat, 0); + e.counts.resize(1, 0); + } + else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) { + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); + exit(1); //GGML_ABORT("fatal error"); + } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); + + for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { + // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D + const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]); + const int64_t mat_start = mat_id * src1->ne[0]; + + for (int64_t row = 0; row < src1->ne[1]; ++row) { + const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]); + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[mat_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str()); + exit(1); + } + } + } + } + } + // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT + for (size_t i = 0; i < e.counts.size(); ++i) { + e.counts[i] += ggml_nrows(src1) / n_mat; + const int32_t n_chunk = e.counts[i] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } + } + } + } + + return true; +} + +void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const { + auto fname = m_params.out_file; + + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } + + // warn when writing imatrix entries that do not have full data + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + int n_entries = 0; + std::vector to_store; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + if (n_all == 0) { + continue; + } + + int n_zeros = 0; + 
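+        // count how many expert slots never received any activations (partial MoE coverage)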
for (const int c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INF("\n"); + is_first = false; + } + + if (n_zeros == n_all) { + LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + continue; + } + + if (n_zeros > 0) { + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + } + + n_entries++; + to_store.push_back(kv.first); + } + + if (to_store.size() < m_stats.size()) { + LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + } + + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + std::ofstream out(fname, std::ios::binary); + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + const int32_t len = name.size(); + out.write((const char *) &len, sizeof(len)); + out.write(name.c_str(), len); + // ceiling division to avoid accidental zeros + const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size; + out.write((const char *) &ncall, sizeof(ncall)); + const int32_t nval = stat.values.size(); + const int32_t nmat = stat.counts.size(); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0 && nmat > 0) { + std::vector tmp(nval); + for (int32_t i = 0; i < nval; i++) { + float count = static_cast(stat.counts[i / (nval / nmat)]); + float value = stat.values[i]; + if (count == 0.0f) { + // store 1 for partial data + value = 1.0f; + count = 1.0f; + } + tmp[i] = (value / count) * static_cast(ncall); + } + out.write((const char *) tmp.data(), nval * sizeof(float)); + } + } + + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_chunk, sizeof(m_last_chunk)); + + // Write the input filename at the end of the file to later on specify it in quantize + { + const char * dataset_file = m_params.prompt_file.c_str(); + int32_t len = m_params.prompt_file.size(); + // When there is no prompt but there were other imatrix files loaded, use the last dataset + if (m_params.prompt_file.empty() && !m_datasets.empty()) { + const std::string & dataset_str = m_datasets[m_datasets.size() - 1]; + dataset_file = dataset_str.c_str(); + len = dataset_str.size(); + } + out.write((const char *) &len, sizeof(len)); + out.write(dataset_file, len); + } + + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); +} + +void IMatrixCollector::save_imatrix(int32_t n_chunk) const { + auto fname = m_params.out_file; + int8_t use_legacy_format = m_params.imat_dat; + + if (use_legacy_format > 0) { + this->save_imatrix_legacy(n_chunk); + return; + } + // only warn when `--output-format gguf` is not specified + if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) { + LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__); + LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__); + } + + if (n_chunk > 0) { + fname += ".at_"; + fname += std::to_string(n_chunk); + } + + // write imatrix entries even if they don't have full data. 
(can be corrected when reading) + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + std::vector to_store; + size_t data_size = 0; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + int n_zeros = 0; + for (const auto c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INF("\n"); + is_first = false; + } + + if (n_zeros > 0) { + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + } + + to_store.push_back(kv.first); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN); + } + + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + struct ggml_init_params params = { + /* .mem_size = */ data_size, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(params); + struct gguf_context * ctx_gguf = gguf_init_empty(); + + { + std::vector datasets; + datasets.reserve(m_datasets.size() + 1); + for (size_t i = 0; i < m_datasets.size(); ++i) { + datasets.push_back(m_datasets[i].c_str()); + } + if (!m_params.prompt_file.empty()) { + datasets.push_back(m_params.prompt_file.c_str()); + } + + gguf_set_val_str(ctx_gguf, "general.type", "imatrix"); + // Write the dataset paths + gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size()); + // Write the number of chunks the matrix was computed with + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk); + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel); + } + + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + const int32_t nval = (int32_t) stat.values.size(); + const int32_t nmat = (int32_t) stat.counts.size(); + if (nval > 0 && nmat > 0) { + struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat); + struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat); + ggml_format_name(in_sum2, "%s.in_sum2", name.c_str()); + ggml_format_name(counts, "%s.counts", name.c_str()); + + for (int32_t j = 0; j < nval; ++j) { + ((float *) in_sum2->data)[j] = (float) stat.values[j]; + } + for (int32_t j = 0; j < nmat; ++j) { + ((float *) counts->data)[j] = (float) stat.counts[j]; + } + + gguf_add_tensor(ctx_gguf, in_sum2); + gguf_add_tensor(ctx_gguf, counts); + } + } + + gguf_write_to_file(ctx_gguf, fname.c_str(), false); + + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); + + gguf_free(ctx_gguf); + ggml_free(ctx); +} + +bool IMatrixCollector::load_imatrix_legacy(const char * fname) { + std::ifstream in(fname, std::ios::binary); + if (!in) { + LOG_ERR("%s: failed to open %s\n", __func__, fname); + return false; + } + int n_entries; + in.read((char *) &n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, fname); + return false; + } + // Guess the chunk size because it's not stored in the file + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + for (int i = 0; i < n_entries; ++i) { + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + std::vector 
name_as_vec(len + 1); + in.read((char *) name_as_vec.data(), len); + if (in.fail()) { + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); + return false; + } + name_as_vec[len] = 0; + std::string name{ name_as_vec.data() }; + auto & e = m_stats[std::move(name)]; + int32_t ncall = 0; + in.read((char *) &ncall, sizeof(ncall)); + int32_t nval = 0; + in.read((char *) &nval, sizeof(nval)); + if (in.fail() || nval < 1) { + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); + m_stats = {}; + return false; + } + + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + e.counts.resize(1, 0); + } + + std::vector tmp(nval); + in.read((char *) tmp.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); + m_stats = {}; + return false; + } + + // Recreate the state as expected by save_imatrix(), and correct for weighted sum. + for (int i = 0; i < nval; i++) { + e.values[i] += tmp[i] * chunk_size; + } + // The legacy format doesn't distinguish the counts for different experts + for (size_t j = 0; j < e.counts.size(); ++j) { + e.counts[j] += ncall * chunk_size; + } + } + + { + // TODO: extract into its own method; this is also used by the GGUF-based format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (chunk_size); + } + + { + // Read the number of calls the matrix was computed with + int32_t n_calls; + in.read((char *) &n_calls, sizeof(n_calls)); + // ignore it because it's not important + } + + // Read the dataset path to include it when writing to GGUF + if (!in.fail()){ + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + if (!in.fail()) { + std::vector dataset; + dataset.resize(len + 1, 0); + in.read(dataset.data(), len); + if (!in.fail()) { + m_datasets.push_back(dataset.data()); + } + } + } + + return true; +} + +// Using GGUF as the file format, for greater extensibility +bool IMatrixCollector::load_imatrix(const char * file_name) { + struct ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); + if (!ctx_gguf) { + return this->load_imatrix_legacy(file_name); + } + const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); + if (n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, file_name); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); + if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) { + const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key); + m_datasets.reserve(m_datasets.size() + n); + for (int64_t i = 0; i < n; ++i) { + m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i)); + } + } + + const std::string in_sum2_suffix{ ".in_sum2" }; + const std::string counts_suffix{ ".counts" }; + + // Could re-use m_stats instead, but this allows + // checking for completeness of *each* loaded imatrix file + // and also makes it easier to re-use a similar implementation in quantize.cpp + // Using an ordered map to get a deterministic iteration order. 
+ std::map> sums_counts_for; + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name = cur->name; + + if (name.empty()) { continue; } + + if (string_remove_suffix(name, in_sum2_suffix)) { + // in_sum2 + sums_counts_for[std::move(name)].first = cur; + } else if (string_remove_suffix(name, counts_suffix)) { + // counts + sums_counts_for[std::move(name)].second = cur; + } else { + // ignore other tensors + } + } + + for (const auto & sc : sums_counts_for) { + const std::string & name = sc.first; + const struct ggml_tensor * in_sum2 = sc.second.first; + const struct ggml_tensor * counts = sc.second.second; + + if (!in_sum2 || !counts) { + LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + auto & e = m_stats[name]; + + int64_t nval = ggml_nelements(in_sum2); + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + } else if ((size_t) nval != e.values.size()) { + LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + int64_t ncounts = ggml_nelements(counts); + if (e.counts.empty()) { + e.counts.resize(ncounts, 0); + } else if (e.counts.size() == 1 && ncounts > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(ncounts, e.counts[0]); + } else if ((size_t) ncounts != e.counts.size()) { + LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + // Recreate the state as expected by save_imatrix() + for (int64_t j = 0; j < nval; j++) { + e.values[j] += ((const float *) in_sum2->data)[j]; + } + for (int64_t j = 0; j < ncounts; j++) { + e.counts[j] += std::lround(((const float *) counts->data)[j]); + } + } + + // TODO: extract into its own method; this is also used by the legacy format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel); + + gguf_free(ctx_gguf); + ggml_free(ctx); + return true; +} + +static IMatrixCollector g_collector; + +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector & logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return 
{logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); + + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + + auto tim1 = std::chrono::high_resolution_clock::now(); + LOG_INF("%s: tokenizing the input ..\n", __func__); + + std::vector tokens = common_tokenize(ctx, params.prompt, true, params.parse_special); + + auto tim2 = std::chrono::high_resolution_clock::now(); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (params.i_chunk > 0) { + if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { + LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + return false; + } + LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); + } + + if (int(tokens.size()) < 2*n_ctx) { + LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); + return false; + } + + std::vector logit_history; + std::vector prob_history; + + if (params.compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_vocab_n_tokens(vocab); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int n_seq = std::max(1, n_batch / n_ctx); + + GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0); + GGML_ASSERT(params.n_ctx == n_seq * n_ctx); + + llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1); + + std::vector logits; + if (params.compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + + LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + for (int i = 0; i < n_chunk; i += n_seq) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + const int n_seq_batch = std::min(n_seq, n_chunk - i); + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_memory_clear(llama_get_memory(ctx), true); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // clear the batch + common_batch_clear(batch); + + for (int seq = 0; seq < n_seq_batch; seq++) { + int seq_start = batch_start + seq*n_ctx; + + // save original token and restore it after eval + const auto token_org = tokens[seq_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[seq_start] = llama_vocab_bos(vocab); + } + for (int k = 0; k < batch_size; ++k) { + // NOTE: specifying all logits to get activations for the output.weight tensor + // and also for the perplexity calculation. + // TODO: only get outputs when (params.process_output || params.compute_ppl) + // (not possible when this skips FFN computation of the last layer) + common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true); + } + + // restore the original token in case it was set to BOS + tokens[seq_start] = token_org; + } + + if (llama_decode(ctx, batch)) { + LOG_ERR("%s : failed to eval\n", __func__); + llama_batch_free(batch); + return false; + } + + if (params.compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + } + + + if (i == 0) { + llama_synchronize(ctx); + const auto t_end = std::chrono::high_resolution_clock::now(); + const float t_total = std::chrono::duration(t_end - t_start).count(); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk / n_seq); + if (total_seconds >= 60*60) { + LOG("%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + LOG("%.2f minutes\n", total_seconds / 60.0); + } + + if (params.compute_ppl) { + const int first = n_ctx/2; + for (int seq = 0; seq < n_seq_batch; seq++) { + const float * all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; + + process_logits(n_vocab, all_logits + first*n_vocab, + tokens_data, n_ctx - 1 - first, + workers, nll, nll2, + logit_history.data() + start + seq*n_ctx + first, + prob_history.data() + start + seq*n_ctx + first); + + count += n_ctx - first - 1; + + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + } + fflush(stdout); + + logits.clear(); + } + } + + LOG("\n"); + + if (params.compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + LOG("Unexpected negative standard deviation of log(prob)\n"); + } + } + + llama_batch_free(batch); + + return true; +} + +static bool show_statistics(const common_params & params) { + std::vector ts; + if (params.in_files.empty() || params.in_files.size() > 1) { + LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); + return false; + } + if (g_collector.load_imatrix(params.in_files[0].c_str())) { + for (const auto & [name, stats] :g_collector.get_mstats()) { + compute_statistics(ts, name, stats); + } + } else { + LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); + return false; + } + if (!ts.empty()) { + compute_cossim(ts); + } else { + LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); + return false; + } + + struct tensor_comparer { + bool operator()(const tensor_statistics & a, const tensor_statistics & b) const { + std::string layer, name_a, name_b; + ; + process_tensor_name(a.tensor, layer, name_a); + process_tensor_name(b.tensor, layer, name_b); + return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract); + } + }; + std::sort(ts.begin(), ts.end(), tensor_comparer()); + + struct weighted_stats { + float weighted_bias = 0.0f; + float weighted_zd = 0.0f; + float weighted_cossim = 0.0f; + int total_elements = 0; + }; + std::map ws; + + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); + LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)", + " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", + " CosSim"); + LOG_INF( + "==============================================================================================================" + "===========================================================\n"); + for (const auto & tstat : ts) { + std::string layer, name; + process_tensor_name(tstat.tensor, layer, name); + + int blk; + try { + blk = std::stoi(layer); + } catch (const std::exception & e) { + blk = -1; // not a block layer + } + + LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", + layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract, + tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy, + 100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim); + + const float weighted_bias = tstat.elements * tstat.total_sqract; + const float weighted_zd = tstat.elements * tstat.zd; + const float weighted_cossim = tstat.elements * tstat.cossim; + + if (ws.find(blk) != ws.end()) { + ws[blk].weighted_bias += weighted_bias; + ws[blk].weighted_zd += weighted_zd; + 
ws[blk].weighted_cossim += weighted_cossim; + ws[blk].total_elements += tstat.elements; + } else { + weighted_stats temp_ws; + temp_ws.weighted_bias = weighted_bias; + temp_ws.weighted_zd = weighted_zd; + temp_ws.weighted_cossim = weighted_cossim; + temp_ws.total_elements = tstat.elements; + ws[blk] = temp_ws; + } + } + + const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); + LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); + LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim"); + LOG_INF("================================================\n"); + for (const auto & [first, second] : ws) { + const auto & layer = first; + const auto & stats = second; + + if (stats.total_elements == 0) { + continue; + } + + if (layer >= 0) { + const float bias = stats.weighted_bias / stats.total_elements; + const float zd = stats.weighted_zd / stats.total_elements; + const float cossim = stats.weighted_cossim / stats.total_elements; + + LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); + } + } + LOG_INF("\n"); + + return true; +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "imatrix.gguf"; + + params.n_ctx = 512; + params.escape = false; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + return 1; + } + + if (params.show_statistics) { + if (!show_statistics(params)) { + return 1; + } + return 0; + } + + common_init(); + + const int32_t n_ctx = params.n_ctx; + + if (n_ctx <= 0) { + LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__); + return 1; + } + + { + const int32_t n_seq = std::max(1, params.n_batch / n_ctx); + const int32_t n_kv = n_seq * n_ctx; + + params.n_parallel = n_seq; + params.n_ctx = n_kv; + + params.n_batch = std::min(params.n_batch, n_kv); + } + + g_collector.set_params(params); + + for (const auto & in_file : params.in_files) { + LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); + return 1; + } + } + + if (params.prompt.empty()) { + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + + if (params.in_files.size() == 1) { + LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str()); + } else if (params.in_files.size() > 1) { + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + } + + g_collector.save_imatrix(); + + return 0; + } + + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = ik_collect_imatrix; + params.cb_eval_user_data = NULL; + params.warmup = false; + + // init + auto llama_init = common_init_from_params(params); + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (model == nullptr || ctx == nullptr) { + LOG_ERR("%s : failed to init\n", __func__); + return 1; + } + + const int n_ctx_train = llama_model_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + LOG_INF("\n"); + 
LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + } + + if (!compute_imatrix(ctx, params, n_ctx)) { + return 1; + } + + g_collector.save_imatrix(); + + LOG("\n"); + llama_perf_context_print(ctx); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/llama-bench/CMakeLists.txt b/llama.cpp/tools/llama-bench/CMakeLists.txt new file mode 100644 index 0000000..b8543a9 --- /dev/null +++ b/llama.cpp/tools/llama-bench/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-bench) +add_executable(${TARGET} llama-bench.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/llama-bench/README.md b/llama.cpp/tools/llama-bench/README.md new file mode 100644 index 0000000..c837bb6 --- /dev/null +++ b/llama.cpp/tools/llama-bench/README.md @@ -0,0 +1,349 @@ +# llama.cpp/tools/llama-bench + +Performance testing tool for llama.cpp. + +## Table of contents + +1. [Syntax](#syntax) +2. [Examples](#examples) + 1. [Text generation with different models](#text-generation-with-different-models) + 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) + 3. [Different numbers of threads](#different-numbers-of-threads) + 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) +3. [Output formats](#output-formats) + 1. [Markdown](#markdown) + 2. [CSV](#csv) + 3. [JSON](#json) + 4. [JSONL](#jsonl) + 5. [SQL](#sql) + +## Syntax + +``` +usage: llama-bench [options] + +options: + -h, --help + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + --list-devices list available devices and exit + -v, --verbose verbose output + --progress print test progress indicators + -rpc, --rpc register RPC devices (comma separated) + +test parameters: + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: ) + -d, --n-depth (default: 0) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: system dependent) + -C, --cpu-mask (default: 0x0) + --cpu-strict <0|1> (default: 0) + --poll <0...100> (default: 50) + -ngl, --n-gpu-layers (default: 99) + -ncmoe, --n-cpu-moe (default: 0) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -dev, --device (default: auto) + -mmp, --mmap <0|1> (default: 1) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -ot --override-tensors =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + +Multiple values can be given for each parameter by separating them with ',' +or by specifying the parameter multiple times. Ranges can be given as +'first-last' or 'first-last+step' or 'first-last*mult'. 
+``` + +llama-bench can perform three types of tests: + +- Prompt processing (pp): processing a prompt in batches (`-p`) +- Text generation (tg): generating a sequence of tokens (`-n`) +- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`) + +With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). + +Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. + +Using the `-d ` option, each test can be run at a specified context depth, prefilling the KV cache with `` tokens. + +For a description of the other options, see the [completion example](../completion/README.md). + +> [!NOTE] +> The measurements with `llama-bench` do not include the times for tokenization and for sampling. + +## Examples + +### Text generation with different models + +```sh +$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | + +### Prompt processing with different batch sizes + +```sh +$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 +``` + +| model | size | params | backend | ngl | n_batch | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | + +### Different numbers of threads + +```sh +$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 +``` + +| model | size | params | backend | threads | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | +| llama 7B mostly Q4_0 | 
3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 | + +### Different numbers of layers offloaded to the GPU + +```sh +$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | + +### Different prefilled context + +``` +$ ./llama-bench -d 0,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | + +## Output formats + +By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. 
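+
+The `-oe` option selects the output format printed to stderr (default: none), which can be useful when stdout is redirected to a file. For example (an illustrative combination of the options from the usage section above):
+
+```sh
+$ ./llama-bench -o csv -oe md > results.csv
+```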
+ +### Markdown + +```sh +$ ./llama-bench -o md +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | + +### CSV + +```sh +$ ./llama-bench -o csv +``` + +```csv +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" +``` + +### JSON + +```sh +$ ./llama-bench -o json +``` + +```json +[ + { + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", + "n_gpu_layers": 99, + "split_mode": "layer", + "main_gpu": 0, + "no_kv_offload": false, + "flash_attn": false, + "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, + "n_prompt": 512, + "n_gen": 0, + "n_depth": 0, + "test_time": "2025-04-24T11:58:50Z", + "avg_ns": 72135640, + "stddev_ns": 1453752, + "avg_ts": 7100.002165, + "stddev_ts": 140.341520, + "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], + "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] + }, + { + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", + "n_gpu_layers": 99, + "split_mode": "layer", + "main_gpu": 0, + "no_kv_offload": false, + "flash_attn": false, + "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, + "n_prompt": 0, + "n_gen": 128, + "n_depth": 0, + "test_time": "2025-04-24T11:58:51Z", + "avg_ns": 1076767880, + "stddev_ns": 9449585, + "avg_ts": 118.881588, + "stddev_ts": 1.041811, + "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], + "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] + } 
+] +``` + + +### JSONL + +```sh +$ ./llama-bench -o jsonl +``` + +```json lines +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +``` + + +### SQL + +SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. 
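+
+For example, the emitted statements can be loaded straight into a local database (illustrative; the database filename is arbitrary):
+
+```sh
+# create or extend a SQLite database from the CREATE TABLE / INSERT statements
+$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
+```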
+ +```sh +$ ./llama-bench -o sql +``` + +```sql +CREATE TABLE IF NOT EXISTS test ( + build_commit TEXT, + build_number INTEGER, + cpu_info TEXT, + gpu_info TEXT, + backends TEXT, + model_filename TEXT, + model_type TEXT, + model_size INTEGER, + model_n_params INTEGER, + n_batch INTEGER, + n_ubatch INTEGER, + n_threads INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, + n_gpu_layers INTEGER, + split_mode TEXT, + main_gpu INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, + tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, + n_prompt INTEGER, + n_gen INTEGER, + n_depth INTEGER, + test_time TEXT, + avg_ns INTEGER, + stddev_ns INTEGER, + avg_ts REAL, + stddev_ts REAL +); + +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +``` diff --git a/llama.cpp/tools/llama-bench/llama-bench.cpp b/llama.cpp/tools/llama-bench/llama-bench.cpp new file mode 100644 index 0000000..7da6c39 --- /dev/null +++ b/llama.cpp/tools/llama-bench/llama-bench.cpp @@ -0,0 +1,2291 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "ggml.h" +#include "llama.h" + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +// utils +static uint64_t get_time_ns() { + using clock = std::chrono::high_resolution_clock; + return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); +} + +static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) { + if (a.pattern != b.pattern) { + // cString comparison that may be null + if (a.pattern == nullptr || b.pattern == nullptr) { + return false; + } + if (strcmp(a.pattern, b.pattern) != 0) { + return false; + } + } + if (a.buft != b.buft) { + return false; + } + return true; +} + +static bool vec_tensor_buft_override_equal(const std::vector& a, const 
std::vector& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + +static bool vec_vec_tensor_buft_override_equal(const std::vector>& a, const std::vector>& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!vec_tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + +template static std::string join(const std::vector & values, const std::string & delim) { + std::ostringstream str; + for (size_t i = 0; i < values.size(); i++) { + str << values[i]; + if (i < values.size() - 1) { + str << delim; + } + } + return str.str(); +} + +template static std::vector transform_to_str(const std::vector & values, F f) { + std::vector str_values; + std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); + return str_values; +} + +template static T avg(const std::vector & v) { + if (v.empty()) { + return 0; + } + T sum = std::accumulate(v.begin(), v.end(), T(0)); + return sum / (T) v.size(); +} + +template static T stdev(const std::vector & v) { + if (v.size() <= 1) { + return 0; + } + T mean = avg(v); + T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); + return stdev; +} + +static std::string get_cpu_info() { + std::vector cpu_list; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + cpu_list.push_back(ggml_backend_dev_description(dev)); + } + } + return join(cpu_list, ", "); +} + +static std::string get_gpu_info() { + std::vector gpu_list; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpu_list.push_back(ggml_backend_dev_description(dev)); + } + } + return join(gpu_list, ", "); +} + +static std::vector parse_devices_arg(const std::string & value) { + std::vector devices; + std::string trimmed = string_strip(value); + if (trimmed.empty()) { + throw std::invalid_argument("no devices specified"); + } + if (trimmed == "auto") { + return devices; + } + + auto dev_names = string_split(trimmed, '/'); + if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") { + devices.push_back(nullptr); + return devices; + } + + for (auto & name : dev_names) { + std::string dev_name = string_strip(name); + if (dev_name.empty()) { + throw std::invalid_argument("invalid device specification"); + } + auto * dev = ggml_backend_dev_by_name(dev_name.c_str()); + if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str())); + } + devices.push_back(dev); + } + + devices.push_back(nullptr); + return devices; +} + +static void register_rpc_server_list(const std::string & servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + + auto * rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + + using add_rpc_server_fn = 
ggml_backend_reg_t (*)(const char * endpoint); + auto * ggml_backend_rpc_add_server_fn = (add_rpc_server_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server"); + if (!ggml_backend_rpc_add_server_fn) { + throw std::invalid_argument("failed to find RPC add server function"); + } + for (const auto & server : rpc_servers) { + auto reg = ggml_backend_rpc_add_server_fn(server.c_str()); + ggml_backend_register(reg); + } +} + +static std::string devices_to_string(const std::vector & devices) { + if (devices.empty()) { + return "auto"; + } + + if (devices.size() == 1 && devices[0] == nullptr) { + return "none"; + } + + std::vector names; + for (auto * dev : devices) { + if (dev == nullptr) { + break; + } + names.push_back(ggml_backend_dev_name(dev)); + } + + return join(names, "/"); +} + +// command line params +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; + +static const char * output_format_str(output_formats format) { + switch (format) { + case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); + } +} + +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "none") { + format = NONE; + } else if (s == "csv") { + format = CSV; + } else if (s == "json") { + format = JSON; + } else if (s == "jsonl") { + format = JSONL; + } else if (s == "md") { + format = MARKDOWN; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); + } +} + +static std::string pair_str(const std::pair & p) { + static char buf[32]; + snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); + return buf; +} + +static std::vector parse_int_range(const std::string & s) { + // first[-last[(+|*)step]] + std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); + + std::smatch match; + std::string::const_iterator search_start(s.cbegin()); + std::vector result; + while (std::regex_search(search_start, s.cend(), match, range_regex)) { + int first = std::stoi(match[1]); + int last = match[2].matched ? std::stoi(match[2]) : first; + char op = match[3].matched ? match[3].str()[0] : '+'; + int step = match[4].matched ? 
std::stoi(match[4]) : 1; + + for (int i = first; i <= last;) { + result.push_back(i); + + int prev_i = i; + + if (op == '+') { + i += step; + } else if (op == '*') { + i *= step; + } else { + throw std::invalid_argument("invalid range format"); + } + + if (i <= prev_i) { + throw std::invalid_argument("invalid range"); + } + } + search_start = match.suffix().first; + } + + if (search_start != s.cend()) { + throw std::invalid_argument("invalid range format"); + } + + return result; +} + +struct cmd_params { + std::vector model; + std::vector n_prompt; + std::vector n_gen; + std::vector> n_pg; + std::vector n_depth; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector n_cpu_moe; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> devices; + std::vector> tensor_split; + std::vector> tensor_buft_overrides; + std::vector use_mmap; + std::vector use_direct_io; + std::vector embeddings; + std::vector no_op_offload; + std::vector no_host; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + bool no_warmup; + output_formats output_format; + output_formats output_format_stderr; +}; + +static const cmd_params cmd_params_defaults = { + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, + /* n_pg */ {}, + /* n_depth */ { 0 }, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* devices */ { {} }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, + /* use_mmap */ { false }, + /* use_direct_io */ { false }, + /* embeddings */ { false }, + /* no_op_offload */ { false }, + /* no_host */ { false }, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* reps */ 5, + /* prio */ GGML_SCHED_PRIO_NORMAL, + /* delay */ 0, + /* verbose */ false, + /* progress */ false, + /* no_warmup */ false, + /* output_format */ MARKDOWN, + /* output_format_stderr */ NONE, +}; + +static void print_usage(int /* argc */, char ** argv) { + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help\n"); + printf(" --numa numa mode (default: disabled)\n"); + printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", + cmd_params_defaults.reps); + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", + cmd_params_defaults.prio); + printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", + cmd_params_defaults.delay); + printf(" -o, --output output format printed to stdout (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err output format printed to stderr (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" --list-devices list available devices and exit\n"); + printf(" -v, --verbose verbose output\n"); + printf(" --progress print test progress 
indicators\n"); + printf(" --no-warmup skip warmup runs before benchmarking\n"); + if (llama_supports_rpc()) { + printf(" -rpc, --rpc register RPC devices (comma separated)\n"); + } + printf("\n"); + printf("test parameters:\n"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", + join(cmd_params_defaults.n_depth, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -dev, --device (default: auto)\n"); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -dio, --direct-io <0|1> (default: %s)\n", + join(cmd_params_defaults.use_direct_io, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -ts, --tensor-split (default: 0)\n"); + printf(" -ot --override-tensor =;...\n"); + printf(" (default: disabled)\n"); + printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" --no-host <0|1> (default: %s)\n", + join(cmd_params_defaults.no_host, ",").c_str()); + printf("\n"); + printf( + "Multiple values can be given for each parameter by separating them with ','\n" + "or by specifying the parameter multiple times. 
Ranges can be given as\n" + "'first-last' or 'first-last+step' or 'first-last*mult'.\n"); +} + +static ggml_type ggml_type_from_name(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "bf16") { + return GGML_TYPE_BF16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + if (s == "iq4_nl") { + return GGML_TYPE_IQ4_NL; + } + + return GGML_TYPE_COUNT; +} + +static cmd_params parse_cmd_params(int argc, char ** argv) { + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; + params.output_format_stderr = cmd_params_defaults.output_format_stderr; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; + params.no_warmup = cmd_params_defaults.no_warmup; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + try { + if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.model.insert(params.model.end(), p.begin(), p.end()); + } else if (arg == "-p" || arg == "--n-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); + } else if (arg == "-n" || arg == "--n-gen") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); + } else if (arg == "-pg") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], ','); + if (p.size() != 2) { + invalid_param = true; + break; + } + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--n-depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); + } else if (arg == "-ctk" || arg == "--cache-type-k") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") 
{ + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); + } else if (arg == "-dev" || arg == "--device") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto combos = string_split(argv[i], split_delim); + for (const auto & combo : combos) { + try { + params.devices.push_back(parse_devices_arg(combo)); + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } + if (invalid_param) { + break; + } + } else if (arg == "--list-devices") { + std::vector devices; + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + devices.push_back(dev); + } + } + printf("Available devices:\n"); + if (devices.empty()) { + printf(" (none)\n"); + } + for (auto * dev : devices) { + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + exit(0); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); + } else if (arg == "--cpu-strict") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); + } else if (arg == "--poll") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.poll.insert(params.poll.end(), p.begin(), p.end()); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); + } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + if (++i >= argc) { + invalid_param = true; + break; + } + try { + register_rpc_server_list(argv[i]); + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_MODE_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_MODE_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + if (invalid_param) { + break; + } 
+ params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); + } else if (arg == "-mg" || arg == "--main-gpu") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.main_gpu = parse_int_range(argv[i]); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } + } else if (arg == "-fa" || arg == "--flash-attn") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); + } else if (arg == "-mmp" || arg == "--mmap") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + } else if (arg == "-dio" || arg == "--direct-io") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end()); + } else if (arg == "-embd" || arg == "--embeddings") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); + } else if (arg == "-nopo" || arg == "--no-op-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); + } else if (arg == "--no-host") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_host.insert(params.no_host.end(), p.begin(), p.end()); + } else if (arg == "-ts" || arg == "--tensor-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + for (auto ts : string_split(argv[i], split_delim)) { + // split string by ; and / + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= llama_max_devices()); + + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + tensor_split[i] = std::stof(split_arg[i]); + } else { + tensor_split[i] = 0.0f; + } + } + params.tensor_split.push_back(tensor_split); + } + } else if (arg == "-ot" || arg == "--override-tensor") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto * value = argv[i]; + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + auto override_group_span_len = std::strcspn(value, ","); + bool last_group = false; + do { + if 
(override_group_span_len == 0) { + // Adds an empty override-tensors for an empty span + params.tensor_buft_overrides.push_back({{}}); + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value = &value[override_group_span_len + 1]; + override_group_span_len = std::strcspn(value, ","); + } + continue; + } + // Stamps null terminators into the argv + // value for this option to avoid the + // memory leak present in the implementation + // over in arg.cpp. Acceptable because we + // only parse these args once in this program. + auto * override_group = value; + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value[override_group_span_len] = '\0'; + value = &value[override_group_span_len + 1]; + } + std::vector group_tensor_buft_overrides{}; + auto override_span_len = std::strcspn(override_group, ";"); + while (override_span_len > 0) { + auto * override = override_group; + if (override_group[override_span_len] != '\0') { + override_group[override_span_len] = '\0'; + override_group = &override_group[override_span_len + 1]; + } else { + override_group = &override_group[override_span_len]; + } + auto tensor_name_span_len = std::strcspn(override, "="); + if (tensor_name_span_len >= override_span_len) { + invalid_param = true; + break; + } + override[tensor_name_span_len] = '\0'; + auto * tensor_name = override; + auto * buffer_type = &override[tensor_name_span_len + 1]; + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("error: unrecognized buffer type '%s'\n", buffer_type); + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + invalid_param = true; + break; + } + group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); + override_span_len = std::strcspn(override_group, ";"); + } + if (invalid_param) { + break; + } + group_tensor_buft_overrides.push_back({nullptr,nullptr}); + params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); + override_group_span_len = std::strcspn(value, ","); + } while (!last_group); + } else if (arg == "-r" || arg == "--repetitions") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.reps = std::stoi(argv[i]); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); + } else if (arg == "--delay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.delay = std::stoi(argv[i]); + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); + } else if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + } else if (arg == "--progress") { + params.progress = true; + } else if (arg == "--no-warmup") { + params.no_warmup = true; + } else { + invalid_param = true; + break; + } + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } + + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + + // set 
defaults + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_depth.empty()) { + params.n_depth = cmd_params_defaults.n_depth; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.devices.empty()) { + params.devices = cmd_params_defaults.devices; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.use_direct_io.empty()) { + params.use_direct_io = cmd_params_defaults.use_direct_io; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.no_op_offload.empty()) { + params.no_op_offload = cmd_params_defaults.no_op_offload; + } + if (params.no_host.empty()) { + params.no_host = cmd_params_defaults.no_host; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } + + return params; +} + +struct cmd_params_instance { + std::string model; + int n_prompt; + int n_gen; + int n_depth; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + int n_cpu_moe; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector devices; + std::vector tensor_split; + std::vector tensor_buft_overrides; + bool use_mmap; + bool use_direct_io; + bool embeddings; + bool no_op_offload; + bool no_host; + + llama_model_params to_llama_mparams() const { + llama_model_params mparams = llama_model_default_params(); + + mparams.n_gpu_layers = n_gpu_layers; + if (!devices.empty()) { + mparams.devices = const_cast(devices.data()); + } + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; + mparams.tensor_split = tensor_split.data(); + mparams.use_mmap = use_mmap; + mparams.use_direct_io = use_direct_io; + mparams.no_host = no_host; + + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + 
} else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } + } else { + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); + } + + return mparams; + } + + bool equal_mparams(const cmd_params_instance & other) const { + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + split_mode == other.split_mode && + main_gpu == other.main_gpu && tensor_split == other.tensor_split && + use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && + devices == other.devices && + no_host == other.no_host && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + } + + llama_context_params to_llama_cparams() const { + llama_context_params cparams = llama_context_default_params(); + + cparams.n_ctx = n_prompt + n_gen + n_depth; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; + cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn_type = flash_attn ? 
LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + cparams.embeddings = embeddings; + cparams.op_offload = !no_op_offload; + cparams.swa_full = false; + + return cparams; + } +}; + +static std::vector get_cmd_params_instances(const cmd_params & params) { + std::vector instances; + + // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off + for (const auto & m : params.model) + for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) + for (const auto & sm : params.split_mode) + for (const auto & mg : params.main_gpu) + for (const auto & devs : params.devices) + for (const auto & ts : params.tensor_split) + for (const auto & ot : params.tensor_buft_overrides) + for (const auto & mmp : params.use_mmap) + for (const auto & dio : params.use_direct_io) + for (const auto & noh : params.no_host) + for (const auto & embd : params.embeddings) + for (const auto & nopo : params.no_op_offload) + for (const auto & nb : params.n_batch) + for (const auto & nub : params.n_ubatch) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) + for (const auto & nkvo : params.no_kv_offload) + for (const auto & fa : params.flash_attn) + for (const auto & nt : params.n_threads) + for (const auto & cm : params.cpu_mask) + for (const auto & cs : params.cpu_strict) + for (const auto & nd : params.n_depth) + for (const auto & pl : params.poll) { + for (const auto & n_prompt : params.n_prompt) { + if (n_prompt == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ n_prompt, + /* .n_gen = */ 0, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + + for (const auto & n_gen : params.n_gen) { + if (n_gen == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ 0, + /* .n_gen = */ n_gen, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + + for (const auto & n_pg : params.n_pg) { + if (n_pg.first == 0 && n_pg.second == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ n_pg.first, + /* .n_gen = */ n_pg.second, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* 
.n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + } + // clang-format on + + return instances; +} + +struct test { + static const std::string build_commit; + static const int build_number; + const std::string cpu_info; + const std::string gpu_info; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + int n_cpu_moe; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector devices; + std::vector tensor_split; + std::vector tensor_buft_overrides; + bool use_mmap; + bool use_direct_io; + bool embeddings; + bool no_op_offload; + bool no_host; + int n_prompt; + int n_gen; + int n_depth; + std::string test_time; + std::vector samples_ns; + + test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : + cpu_info(get_cpu_info()), + gpu_info(get_gpu_info()) { + + model_filename = inst.model; + char buf[128]; + llama_model_desc(lmodel, buf, sizeof(buf)); + model_type = buf; + model_size = llama_model_size(lmodel); + model_n_params = llama_model_n_params(lmodel); + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + devices = inst.devices; + tensor_split = inst.tensor_split; + tensor_buft_overrides = inst.tensor_buft_overrides; + use_mmap = inst.use_mmap; + use_direct_io = inst.use_direct_io; + embeddings = inst.embeddings; + no_op_offload = inst.no_op_offload; + no_host = inst.no_host; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; + n_depth = inst.n_depth; + // RFC 3339 date-time format + time_t t = time(NULL); + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + (void) ctx; + } + + uint64_t avg_ns() const { return ::avg(samples_ns); } + + uint64_t stdev_ns() const { return ::stdev(samples_ns); } + + std::vector get_ts() const { + int n_tokens = n_prompt + n_gen; + std::vector ts; + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + return ts; + } + + double avg_ts() const { return ::avg(get_ts()); } + + double stdev_ts() const { return ::stdev(get_ts()); } + + static std::string get_backend() { + std::vector backends; + bool rpc_used = false; + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + std::string name = ggml_backend_reg_name(reg); + if (string_starts_with(name, "RPC")) { + if (ggml_backend_reg_dev_count(reg) > 0) { + rpc_used = true; + } + } else { + if (name != "CPU") { + backends.push_back(ggml_backend_reg_name(reg)); + } + } + } + if (rpc_used) { + backends.push_back("RPC"); + } + return backends.empty() ? 
"CPU" : join(backends, ","); + } + + static const std::vector & get_fields() { + static const std::vector fields = { + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", + "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", + "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" + }; + return fields; + } + + enum field_type { STRING, BOOL, INT, FLOAT }; + + static field_type get_field_type(const std::string & field) { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { + return INT; + } + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { + return BOOL; + } + if (field == "avg_ts" || field == "stddev_ts") { + return FLOAT; + } + return STRING; + } + + std::vector get_values() const { + std::string tensor_split_str; + std::string tensor_buft_overrides_str; + int max_nonzero = 0; + for (size_t i = 0; i < llama_max_devices(); i++) { + if (tensor_split[i] > 0) { + max_nonzero = i; + } + } + for (int i = 0; i <= max_nonzero; i++) { + char buf[32]; + snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]); + tensor_split_str += buf; + if (i < max_nonzero) { + tensor_split_str += "/"; + } + } + if (tensor_buft_overrides.size() == 1) { + // Last element of tensor_buft_overrides is always a null pattern + // so if it is only one element long, it must be a null pattern. 
+ GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr); + tensor_buft_overrides_str += "none"; + } else { + for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) { + // Last element of tensor_buft_overrides is always a null pattern + if (tensor_buft_overrides[i].pattern == nullptr) { + tensor_buft_overrides_str += "none"; + } else { + tensor_buft_overrides_str += tensor_buft_overrides[i].pattern; + tensor_buft_overrides_str += "="; + tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft); + } + if (i + 2 < tensor_buft_overrides.size()) { + tensor_buft_overrides_str += ";"; + } + } + } + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + devices_to_string(devices), + tensor_split_str, + tensor_buft_overrides_str, + std::to_string(use_mmap), + std::to_string(use_direct_io), + std::to_string(embeddings), + std::to_string(no_op_offload), + std::to_string(no_host), + std::to_string(n_prompt), + std::to_string(n_gen), + std::to_string(n_depth), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; + return values; + } + + std::map get_map() const { + std::map map; + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); + return map; + } +}; + +const std::string test::build_commit = LLAMA_COMMIT; +const int test::build_number = LLAMA_BUILD_NUMBER; + +struct printer { + virtual ~printer() {} + + FILE * fout; + + virtual void print_header(const cmd_params & params) { (void) params; } + + virtual void print_test(const test & t) = 0; + + virtual void print_footer() {} +}; + +struct csv_printer : public printer { + static std::string escape_csv(const std::string & field) { + std::string escaped = "\""; + for (auto c : field) { + if (c == '"') { + escaped += "\""; + } + escaped += c; + } + escaped += "\""; + return escaped; + } + + void print_header(const cmd_params & params) override { + std::vector fields = test::get_fields(); + fprintf(fout, "%s\n", join(fields, ",").c_str()); + (void) params; + } + + void print_test(const test & t) override { + std::vector values = t.get_values(); + std::transform(values.begin(), values.end(), values.begin(), escape_csv); + fprintf(fout, "%s\n", join(values, ",").c_str()); + } +}; + +static std::string escape_json(const std::string & value) { + std::string escaped; + for (auto c : value) { + if (c == '"') { + escaped += "\\\""; + } else if (c == '\\') { + escaped += "\\\\"; + } else if (c <= 0x1f) { + char buf[8]; + snprintf(buf, sizeof(buf), "\\u%04x", c); + escaped += buf; + } else { + escaped += c; + } + } + return escaped; +} + +static std::string format_json_value(const std::string & field, const std::string & value) { + switch (test::get_field_type(field)) { + case test::STRING: + return "\"" + escape_json(value) + "\""; + case test::BOOL: + return value == "0" ? 
"false" : "true"; + default: + return value; + } +} + +struct json_printer : public printer { + bool first = true; + + void print_header(const cmd_params & params) override { + fprintf(fout, "[\n"); + (void) params; + } + + void print_fields(const std::vector & fields, const std::vector & values) { + assert(fields.size() == values.size()); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); + } + } + + void print_test(const test & t) override { + if (first) { + first = false; + } else { + fprintf(fout, ",\n"); + } + fprintf(fout, " {\n"); + print_fields(test::get_fields(), t.get_values()); + fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); + fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " }"); + fflush(fout); + } + + void print_footer() override { fprintf(fout, "\n]\n"); } +}; + +struct jsonl_printer : public printer { + void print_fields(const std::vector & fields, const std::vector & values) { + assert(fields.size() == values.size()); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + } + } + + void print_test(const test & t) override { + fprintf(fout, "{"); + print_fields(test::get_fields(), t.get_values()); + fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); + fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "}\n"); + fflush(fout); + } +}; + +struct markdown_printer : public printer { + std::vector fields; + + static int get_field_width(const std::string & field) { + if (field == "model") { + return -30; + } + if (field == "t/s") { + return 20; + } + if (field == "size" || field == "params") { + return 10; + } + if (field == "n_gpu_layers") { + return 3; + } + if (field == "n_threads") { + return 7; + } + if (field == "n_batch") { + return 7; + } + if (field == "n_ubatch") { + return 8; + } + if (field == "type_k" || field == "type_v") { + return 6; + } + if (field == "split_mode") { + return 5; + } + if (field == "flash_attn") { + return 2; + } + if (field == "devices") { + return -12; + } + if (field == "use_mmap") { + return 4; + } + if (field == "use_direct_io") { + return 3; + } + if (field == "test") { + return 15; + } + if (field == "no_op_offload") { + return 4; + } + if (field == "no_host") { + return 4; + } + + int width = std::max((int) field.length(), 10); + + if (test::get_field_type(field) == test::STRING) { + return -width; + } + return width; + } + + static std::string get_field_display_name(const std::string & field) { + if (field == "n_gpu_layers") { + return "ngl"; + } + if (field == "split_mode") { + return "sm"; + } + if (field == "n_threads") { + return "threads"; + } + if (field == "no_kv_offload") { + return "nkvo"; + } + if (field == "flash_attn") { + return "fa"; + } + if (field == "use_mmap") { + return "mmap"; + } + if (field == "use_direct_io") { + return "dio"; + } + if (field == "embeddings") { + return "embd"; + } + if (field == "no_op_offload") { + return "nopo"; + } + if (field == "no_host") { + return "noh"; + } + if (field == "devices") { + return "dev"; + } + if (field == "tensor_split") { + return "ts"; + } + if (field == "tensor_buft_overrides") { + return "ot"; + } + return field; + } + + void print_header(const cmd_params & params) override { + // select fields to print + 
fields.emplace_back("model"); + fields.emplace_back("size"); + fields.emplace_back("params"); + fields.emplace_back("backend"); + bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || + test::get_backend().find("BLAS") != std::string::npos || + test::get_backend().find("ZenDNN") != std::string::npos; + if (!is_cpu_backend) { + fields.emplace_back("n_gpu_layers"); + } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } + if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { + fields.emplace_back("n_threads"); + } + if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) { + fields.emplace_back("cpu_mask"); + } + if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) { + fields.emplace_back("cpu_strict"); + } + if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) { + fields.emplace_back("poll"); + } + if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { + fields.emplace_back("n_batch"); + } + if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { + fields.emplace_back("n_ubatch"); + } + if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { + fields.emplace_back("type_k"); + } + if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { + fields.emplace_back("type_v"); + } + if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { + fields.emplace_back("main_gpu"); + } + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.emplace_back("split_mode"); + } + if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { + fields.emplace_back("no_kv_offload"); + } + if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) { + fields.emplace_back("flash_attn"); + } + if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) { + fields.emplace_back("devices"); + } + if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { + fields.emplace_back("tensor_split"); + } + if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) { + fields.emplace_back("tensor_buft_overrides"); + } + if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + fields.emplace_back("use_mmap"); + } + if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { + fields.emplace_back("use_direct_io"); + } + if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { + fields.emplace_back("embeddings"); + } + if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { + fields.emplace_back("no_op_offload"); + } + if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) { + fields.emplace_back("no_host"); + } + fields.emplace_back("test"); + fields.emplace_back("t/s"); + + fprintf(fout, "|"); + for (const auto & field : fields) { + fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str()); + } + fprintf(fout, "\n"); + fprintf(fout, "|"); + for (const auto & field : fields) { + int width = get_field_width(field); + fprintf(fout, " %s%s |", 
std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-"); + } + fprintf(fout, "\n"); + } + + void print_test(const test & t) override { + std::map vmap = t.get_map(); + + fprintf(fout, "|"); + for (const auto & field : fields) { + std::string value; + char buf[128]; + if (field == "model") { + value = t.model_type; + } else if (field == "size") { + if (t.model_size < 1024 * 1024 * 1024) { + snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); + } else { + snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); + } + value = buf; + } else if (field == "params") { + if (t.model_n_params < 1000 * 1000 * 1000) { + snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); + } else { + snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); + } + value = buf; + } else if (field == "backend") { + value = test::get_backend(); + } else if (field == "test") { + if (t.n_prompt > 0 && t.n_gen == 0) { + snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); + } else if (t.n_gen > 0 && t.n_prompt == 0) { + snprintf(buf, sizeof(buf), "tg%d", t.n_gen); + } else { + snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); + } + if (t.n_depth > 0) { + int len = strlen(buf); + snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth); + } + value = buf; + } else if (field == "t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); + value = buf; + } else if (vmap.find(field) != vmap.end()) { + value = vmap.at(field); + } else { + assert(false); + exit(1); + } + + int width = get_field_width(field); + if (field == "t/s") { + // HACK: the utf-8 character is 2 bytes + width += 1; + } + fprintf(fout, " %*s |", width, value.c_str()); + } + fprintf(fout, "\n"); + } + + void print_footer() override { + fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number); + } +}; + +struct sql_printer : public printer { + static std::string get_sql_field_type(const std::string & field) { + switch (test::get_field_type(field)) { + case test::STRING: + return "TEXT"; + case test::BOOL: + case test::INT: + return "INTEGER"; + case test::FLOAT: + return "REAL"; + default: + assert(false); + exit(1); + } + } + + void print_header(const cmd_params & params) override { + std::vector fields = test::get_fields(); + fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n"); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); + } + fprintf(fout, ");\n"); + fprintf(fout, "\n"); + (void) params; + } + + void print_test(const test & t) override { + fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str()); + fprintf(fout, "VALUES ("); + std::vector values = t.get_values(); + for (size_t i = 0; i < values.size(); i++) { + fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? 
", " : ""); + } + fprintf(fout, ");\n"); + } +}; + +struct ctx_state { + int depth = 0; // in tokens + + std::vector buf; // the llama_context state buffer +}; + +static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { + llama_set_n_threads(ctx, n_threads, n_threads); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + std::vector tokens(n_batch); + + int n_processed = 0; + + while (n_processed < n_prompt) { + int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } + int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); + if (res != 0) { + fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res); + return false; + } + n_processed += n_tokens; + } + + llama_synchronize(ctx); + return true; +} + +static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { + llama_set_n_threads(ctx, n_threads, n_threads); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; + + for (int i = 0; i < n_gen; i++) { + int res = llama_decode(ctx, llama_batch_get_one(&token, 1)); + if (res != 0) { + fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res); + return false; + } + llama_synchronize(ctx); + token = std::rand() % n_vocab; + } + return true; +} + +static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) text; + (void) user_data; +} + +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case NONE: + return nullptr; + case CSV: + return std::unique_ptr(new csv_printer()); + case JSON: + return std::unique_ptr(new json_printer()); + case JSONL: + return std::unique_ptr(new jsonl_printer()); + case MARKDOWN: + return std::unique_ptr(new markdown_printer()); + case SQL: + return std::unique_ptr(new sql_printer()); + } + GGML_ABORT("fatal error"); +} + +int main(int argc, char ** argv) { + // try to set locale for unicode characters in markdown + setlocale(LC_CTYPE, ".UTF-8"); + +#if !defined(NDEBUG) + fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); +#endif + +#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__)) + fprintf(stderr, "warning: debug build, performance may be affected\n"); +#endif + +#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) + fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); +#endif + + // initialize backends + ggml_backend_load_all(); + + cmd_params params = parse_cmd_params(argc, argv); + + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) 
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + + // initialize llama.cpp + if (!params.verbose) { + llama_log_set(llama_null_log_callback, NULL); + } + llama_backend_init(); + llama_numa_init(params.numa); + + if (!set_process_priority(params.prio)) { + fprintf(stderr, "%s: error: failed to set process priority\n", __func__); + return 1; + } + + // initialize printer + std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p_err = create_printer(params.output_format_stderr); + + if (p) { + p->fout = stdout; + p->print_header(params); + } + + if (p_err) { + p_err->fout = stderr; + p_err->print_header(params); + } + + std::vector params_instances = get_cmd_params_instances(params); + + llama_model * lmodel = nullptr; + const cmd_params_instance * prev_inst = nullptr; + + // store the llama_context state at the previous depth that we performed a test + // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721 + ctx_state cstate; + + int params_idx = 0; + auto params_count = params_instances.size(); + for (const auto & inst : params_instances) { + params_idx++; + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); + } + // keep the same model between tests when possible + if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { + if (lmodel) { + llama_model_free(lmodel); + } + + lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); + if (lmodel == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); + return 1; + } + prev_inst = &inst; + } + + llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); + llama_model_free(lmodel); + return 1; + } + + test t(inst, lmodel, ctx); + + llama_memory_clear(llama_get_memory(ctx), false); + + // cool off before the test + if (params.delay) { + std::this_thread::sleep_for(std::chrono::seconds(params.delay)); + } + + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); + if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { + fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + + llama_attach_threadpool(ctx, threadpool, NULL); + + // warmup run + if (!params.no_warmup) { + if (t.n_prompt > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); + } + //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); + bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + if (t.n_gen > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); + } + bool res = test_gen(ctx, 1, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: 
failed to run gen warmup\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + } + + for (int i = 0; i < params.reps; i++) { + llama_memory_clear(llama_get_memory(ctx), false); + + if (t.n_depth > 0) { + bool is_cached = t.n_depth == cstate.depth; + + if (is_cached) { + // if previously we have computed at this depth, just restore the state + const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + if (ret == 0) { + // if the old state is incompatible with the current context - reprocess from scratch + is_cached = false; + } + } + + if (!is_cached) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run depth\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + + // store the context state for reuse in later runs + cstate.depth = t.n_depth; + cstate.buf.resize(llama_state_seq_get_size(ctx, 0)); + llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + } else { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count, + i + 1, params.reps); + } + } + } + + uint64_t t_start = get_time_ns(); + + if (t.n_prompt > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run prompt\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + if (t.n_gen > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_gen(ctx, t.n_gen, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run gen\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + + uint64_t t_ns = get_time_ns() - t_start; + t.samples_ns.push_back(t_ns); + } + + if (p) { + p->print_test(t); + fflush(p->fout); + } + + if (p_err) { + p_err->print_test(t); + fflush(p_err->fout); + } + + llama_perf_context_print(ctx); + + llama_free(ctx); + + ggml_threadpool_free_fn(threadpool); + } + + llama_model_free(lmodel); + + if (p) { + p->print_footer(); + } + + if (p_err) { + p_err->print_footer(); + } + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/mtmd/CMakeLists.txt b/llama.cpp/tools/mtmd/CMakeLists.txt new file mode 100644 index 0000000..02d71f2 --- /dev/null +++ b/llama.cpp/tools/mtmd/CMakeLists.txt @@ -0,0 +1,96 @@ +# mtmd + +find_package(Threads REQUIRED) + +add_library(mtmd + mtmd.cpp + mtmd-audio.cpp + mtmd.h + mtmd-helper.cpp + mtmd-helper.h + clip.cpp + clip.h + clip-impl.h + clip-model.h + clip-graph.h + models/models.h + models/cogvlm.cpp + models/conformer.cpp + models/glm4v.cpp + models/internvl.cpp + models/kimivl.cpp + models/kimik25.cpp + models/llama4.cpp + models/llava.cpp + models/minicpmv.cpp + models/pixtral.cpp + models/qwen2vl.cpp + models/qwen3vl.cpp + models/siglip.cpp + models/whisper-enc.cpp + models/mobilenetv5.cpp + models/youtuvl.cpp + ) + +set_target_properties(mtmd PROPERTIES + VERSION ${LLAMA_INSTALL_VERSION} + SOVERSION 0 + MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version 
number +) + +target_link_libraries (mtmd PUBLIC ggml llama) +target_link_libraries (mtmd PRIVATE Threads::Threads) +target_include_directories(mtmd PUBLIC .) +target_include_directories(mtmd PRIVATE ../..) +target_include_directories(mtmd PRIVATE ../../vendor) +target_compile_features (mtmd PRIVATE cxx_std_17) + +if (BUILD_SHARED_LIBS) + set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) + target_compile_definitions(mtmd PUBLIC LLAMA_SHARED) +endif() + +set(MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h + ) + +set_target_properties(mtmd + PROPERTIES + PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") + +install(TARGETS mtmd LIBRARY PUBLIC_HEADER) + +if (NOT MSVC) + # for stb_image.h and miniaudio.h + target_compile_options(mtmd PRIVATE -Wno-cast-qual) +endif() + +if (TARGET BUILD_INFO) + add_dependencies(mtmd BUILD_INFO) + add_dependencies(mtmd-helper BUILD_INFO) +endif() + +# if mtmd is linked against common, we throw an error +if (TARGET mtmd) + get_target_property(libs mtmd LINK_LIBRARIES) + if (libs AND "common" IN_LIST libs) + message(FATAL_ERROR "mtmd is designed to be a public library.\n" + "It must not link against common") + endif() +endif() + +add_executable(llama-llava-cli deprecation-warning.cpp) +add_executable(llama-gemma3-cli deprecation-warning.cpp) +add_executable(llama-minicpmv-cli deprecation-warning.cpp) +add_executable(llama-qwen2vl-cli deprecation-warning.cpp) + +set(TARGET llama-mtmd-cli) +add_executable (${TARGET} mtmd-cli.cpp) +set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() +target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/llama.cpp/tools/mtmd/README.md b/llama.cpp/tools/mtmd/README.md new file mode 100644 index 0000000..ef31d19 --- /dev/null +++ b/llama.cpp/tools/mtmd/README.md @@ -0,0 +1,63 @@ +# Multimodal Support in llama.cpp + +This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported. + +> [!IMPORTANT] +> +> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**. + +The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify: + +- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction. +- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure. +- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. 
This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users. +- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. +- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. + +## Pre-quantized models + +See the list of pre-quantized models [here](../../docs/multimodal.md). + +## How it works and what is `mmproj`? + +Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. + +This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging. + +Consequently, running a multimodal model typically requires two GGUF files: +1. The standard language model file. +2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection. + +## What is `libmtmd`? + +As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs. + +Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages: +- **Unified Interface:** Aims to consolidate interaction for various multimodal models. +- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library. +- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models. + +## How to obtain `mmproj` + +Multimodal projector (`mmproj`) files are specific to each model architecture.
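+
+Once you have a matching `mmproj` file, it is passed alongside the language model at runtime. The command below is only an illustrative sketch: the file names are placeholders, and the exact flags may differ between releases (see `llama-mtmd-cli --help` for the current options):
+
+```sh
+# hypothetical file names; the language model goes to -m, the projector to --mmproj
+llama-mtmd-cli -m model-text.gguf --mmproj mmproj-model.gguf --image input.jpg -p "Describe this image."
+```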
+ +For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file: +- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support +- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint +- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen)) +- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) +- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported) + +For older models, please refer to the relevant guide for instructions on how to obtain or create them: + +NOTE: conversion scripts are located under `tools/mtmd/legacy-models` + +- [LLaVA](../../docs/multimodal/llava.md) +- [MobileVLM](../../docs/multimodal/MobileVLM.md) +- [GLM-Edge](../../docs/multimodal/glmedge.md) +- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md) +- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md) +- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) +- [IBM Granite Vision](../../docs/multimodal/granitevision.md) diff --git a/llama.cpp/tools/mtmd/clip-graph.h b/llama.cpp/tools/mtmd/clip-graph.h new file mode 100644 index 0000000..4c7f750 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-graph.h @@ -0,0 +1,117 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include +#include + +#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float 
norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); + + // Generic function to stack frames for audio processing + // Abstracts out the StackAudioFrames logic used by ultravox + ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed); +}; diff --git a/llama.cpp/tools/mtmd/clip-impl.h b/llama.cpp/tools/mtmd/clip-impl.h new file mode 100644 index 0000000..3bc93ea --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-impl.h @@ -0,0 +1,582 @@ +#pragma once + +#include "ggml.h" +#include "gguf.h" +#include "clip.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// Internal header for clip.cpp + +#define MTMD_INTERNAL_HEADER + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder" +#define KEY_HAS_VISION_ENC "clip.has_vision_encoder" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_USE_SILU "clip.use_silu" + +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" + +// vision-specific +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" +#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN 
"clip.vision.n_wa_pattern" +#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" +#define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" + +// audio-specific +#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities +#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" +#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" + + +// +// tensor name constants +// + +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" +#define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_NORM_EMBD "v.norm_embd.%s" +#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" +#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale +#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MM_UP "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_DOWN "mm.down.%s" +#define TN_MM_POST_NORM "mm.post_norm.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_NORM "mm.input_norm.weight" +#define TN_MM_INP_NORM_B "mm.input_norm.bias" +#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 +#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral +#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) +#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) +#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack + +// mimicpmv +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + +#define TN_GLM_ADAPER_CONV "adapter.conv.%s" +#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s" +#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s" +#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" +#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" +#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" + +// ultravox +#define TN_CONV1D "a.conv1d.%d.%s" +#define TN_MM_AUDIO_MLP 
"mm.a.mlp.%d.%s" +#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer +#define TN_MM_NORM_PRE "mm.a.norm_pre.%s" +#define TN_MM_NORM_MID "mm.a.norm_mid.%s" + +// cogvlm +#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s" +#define TN_MM_H_TO_4H "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_4H_TO_H "mm.down.%s" +#define TN_TOK_BOI "v.boi" +#define TN_TOK_EOI "v.eoi" + +// (conformer) lfm2 +#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" +#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s" +#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s" +#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u" +#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v" +#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s" +#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s" +#define TN_CONV_DW "%s.blk.%d.conv_dw.%s" +#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s" +#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" +#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" + +// mobilenetv5 (gemma3n) definitions +#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" +#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight" + +// Stage 0 Block (Edge Residual) +#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight" + +// Stage 1+ Block (Universal Inverted Residual) +#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma" + +// Attention Components +#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks + +// MSFA +#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight" + + +// align x to upper multiple of n +#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) + +// forward declaration +// TODO: improve this later +struct clip_ctx; + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_MINICPMV, + PROJECTOR_TYPE_GLM_EDGE, + PROJECTOR_TYPE_QWEN2VL, + PROJECTOR_TYPE_QWEN3VL, + PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_GEMMA3NV, + PROJECTOR_TYPE_GEMMA3NA, + PROJECTOR_TYPE_IDEFICS3, + 
PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, + PROJECTOR_TYPE_ULTRAVOX, + PROJECTOR_TYPE_INTERNVL, + PROJECTOR_TYPE_LLAMA4, + PROJECTOR_TYPE_QWEN2A, + PROJECTOR_TYPE_GLMA, + PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx + PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_MUSIC_FLAMINGO, + PROJECTOR_TYPE_LFM2, + PROJECTOR_TYPE_KIMIVL, + PROJECTOR_TYPE_LIGHTONOCR, + PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_LFM2A, + PROJECTOR_TYPE_GLM4V, + PROJECTOR_TYPE_YOUTUVL, + PROJECTOR_TYPE_KIMIK25, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, + { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, + { PROJECTOR_TYPE_INTERNVL, "internvl"}, + { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_GLMA, "glma"}, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, + { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, + { PROJECTOR_TYPE_LFM2, "lfm2"}, + { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_LFM2A, "lfm2a"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, + { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, + { PROJECTOR_TYPE_KIMIK25, "kimik25"}, +}; + +static projector_type clip_projector_type_from_string(const std::string & str) { + for (const auto & pair : PROJECTOR_TYPE_NAMES) { + if (pair.second == str) { + return pair.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// For images, buf.size() == nx*ny*3 +// Memory layout: RGBRGBRGB... +// For audio, only one channel is used, buf.size() == nx*ny +// nx will be n_frames and ny will be n_mel +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +// +// logging +// + +static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +struct clip_logger_state { + ggml_log_callback log_callback; + void * log_callback_user_data; +}; + +extern struct clip_logger_state g_logger_state; + +static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) { + return; + } + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + free(buffer2); + } + va_end(args_copy); +} + +static void clip_log_internal(enum ggml_log_level level, const char * format, ...) 
{ + va_list args; + va_start(args, format); + clip_log_internal_v(level, format, args); + va_end(args); +} + +#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) +#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__) + +// +// cpp wrappers +// + +// wrapper for clip_image_size +struct clip_image_size_deleter { + void operator()(clip_image_size * val) { clip_image_size_free(val); } +}; +typedef std::unique_ptr clip_image_size_ptr; + +// wrapper for clip_image_u8 +struct clip_image_u8_deleter { + void operator()(clip_image_u8 * val) { clip_image_u8_free(val); } +}; +typedef std::unique_ptr clip_image_u8_ptr; + +// wrapper for clip_image_f32 +struct clip_image_f32_deleter { + void operator()(clip_image_f32 * val) { clip_image_f32_free(val); } +}; +typedef std::unique_ptr clip_image_f32_ptr; + +struct clip_image_u8_batch { + std::vector entries; +}; + +struct clip_image_f32_batch { + std::vector entries; + bool is_audio = false; + + // for llava-uhd style models, we need to know the grid size + // note: entries.size() == grid_x * grid_y + 1 (one overview image) + int grid_x = 0; + int grid_y = 0; + + clip_image_f32_batch clone() const { + clip_image_f32_batch new_batch{ + /* entries */ {}, + /* is_audio */ is_audio, + /* grid_x */ grid_x, + /* grid_y */ grid_y, + }; + new_batch.entries.reserve(entries.size()); + for (const auto & entry : entries) { + new_batch.entries.emplace_back(new clip_image_f32(*entry)); + } + return new_batch; + } +}; + +// +// common utils +// + +static std::string string_format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; + } + std::string builder; + builder.reserve(s.length()); + size_t pos = 0; + size_t last_pos = 0; + while ((pos = s.find(search, last_pos)) != std::string::npos) { + builder.append(s, last_pos, pos - last_pos); + builder.append(replace); + last_pos = pos + search.length(); + } + builder.append(s, last_pos, std::string::npos); + s = std::move(builder); +} + +// split string by a `std::string delim` instead of `char delim` +static std::vector string_split_str(std::string s, const std::string & delimiter) { + std::vector tokens; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + return tokens; +} + +// +// gguf utils +// + +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return string_format("unknown type %d", type); + } +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? 
nullptr : gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + string_replace_all(val, "\\", "\\\\"); + string_replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +// +// debugging +// + +static void print_tensor_shape(ggml_tensor * t) { + printf("%s.shape = [", t->name); + for (int i = 0; i < ggml_n_dims(t); ++i) { + printf("%" PRId64, t->ne[i]); + if (i < ggml_n_dims(t) - 1) { + printf(", "); + } + } + printf("]\n"); +} + +static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { + ggml_type type = t->type; + int64_t * ne = t->ne; + size_t * nb = t->nb; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf("%s.data: [\n", t->name); + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2*n) { + printf(" ..., \n"); + i2 = ne[2] - n; + } + printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2*n) { + printf(" ..., \n"); + i1 = ne[1] - n; + } + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2*n) { + printf("..., "); + i0 = ne[0] - n; + } + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) &data[i]; + } else { + GGML_ABORT("fatal error"); + } + printf("%8.4f", v); + if (i0 < ne[0] - 1) printf(", "); + } + printf("],\n"); + } + printf(" ],\n"); + } + printf(" ]\n"); + } +} + +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip-model.h b/llama.cpp/tools/mtmd/clip-model.h new file mode 100644 index 0000000..d4ff915 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-model.h @@ -0,0 +1,389 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include +#include +#include +#include +#include + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + 
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + float eps = 1e-6; + float rope_theta = 0.0; + + std::vector image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // audio-to-mel preprocessor params + int32_t audio_chunk_len = -1; // in seconds + int32_t audio_sample_rate = -1; + int32_t audio_n_fft = -1; + int32_t audio_window_len = -1; + int32_t audio_hop_len = -1; + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + // lfm2 + ggml_tensor * ff_norm_w = nullptr; + ggml_tensor * ff_norm_b = nullptr; + ggml_tensor * ff_norm_1_w = nullptr; + ggml_tensor * ff_norm_1_b = nullptr; + ggml_tensor * ff_up_1_w = nullptr; + ggml_tensor * ff_up_1_b = nullptr; + ggml_tensor * ff_down_1_w = nullptr; + ggml_tensor * ff_down_1_b = nullptr; + ggml_tensor * pos_bias_u = nullptr; + ggml_tensor * pos_bias_v = nullptr; + ggml_tensor * norm_conv_w = nullptr; + ggml_tensor * norm_conv_b = 
nullptr; + ggml_tensor * linear_pos_w = nullptr; + + ggml_tensor * conv_norm_w = nullptr; + ggml_tensor * conv_norm_b = nullptr; + ggml_tensor * conv_dw_w = nullptr; + ggml_tensor * conv_dw_b = nullptr; + ggml_tensor * conv_pw1_w = nullptr; + ggml_tensor * conv_pw1_b = nullptr; + ggml_tensor * conv_pw2_w = nullptr; + ggml_tensor * conv_pw2_b = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +// Expanded MobileNetV5 block structure for Gemma3n vision encoder +struct mobilenetv5_block { + // Stage 0 (Edge Residual) + ggml_tensor * s0_conv_exp_w = nullptr; + ggml_tensor * s0_bn1_w = nullptr; + ggml_tensor * s0_conv_pwl_w = nullptr; + ggml_tensor * s0_bn2_w = nullptr; + + // Stage 1+ (Universal Inverted Residual) + ggml_tensor * dw_start_w = nullptr; + ggml_tensor * dw_start_bn_w = nullptr; + + ggml_tensor * pw_exp_w = nullptr; + ggml_tensor * pw_exp_bn_w = nullptr; + + ggml_tensor * dw_mid_w = nullptr; + ggml_tensor * dw_mid_bn_w = nullptr; + + ggml_tensor * pw_proj_w = nullptr; + ggml_tensor * pw_proj_bn_w = nullptr; + + ggml_tensor * layer_scale_w = nullptr; + + // Attention (MQA) components + ggml_tensor * attn_q_w = nullptr; + ggml_tensor * attn_k_w = nullptr; + ggml_tensor * attn_v_w = nullptr; + ggml_tensor * attn_o_w = nullptr; + + // Optional downsampling/norm in attention + ggml_tensor * attn_k_dw_w = nullptr; + ggml_tensor * attn_k_norm_w = nullptr; + ggml_tensor * attn_v_dw_w = nullptr; + ggml_tensor * attn_v_norm_w = nullptr; + + // Block norm (often present in attention blocks) + ggml_tensor * attn_norm_w = nullptr; +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + ggml_tensor * norm_embd_w = nullptr; + ggml_tensor * norm_embd_b = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + ggml_tensor * mm_ffn_up_w = nullptr; + ggml_tensor * mm_ffn_up_b = nullptr; + ggml_tensor * mm_ffn_gate_w = nullptr; + ggml_tensor * mm_ffn_gate_b = nullptr; + ggml_tensor * mm_ffn_down_w = nullptr; + ggml_tensor * mm_ffn_down_b = nullptr; + ggml_tensor * mm_post_norm_w = nullptr; + ggml_tensor * mm_post_norm_b = nullptr; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // 
MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // mobilenetv5 for gemma3n + std::vector mobilenet_blocks; + std::vector mobilenet_stage_ends; + ggml_tensor * mobilenet_stem_conv_w = nullptr; + ggml_tensor * mobilenet_stem_conv_b = nullptr; + ggml_tensor * mobilenet_stem_norm_w = nullptr; + ggml_tensor * mm_post_proj_norm_w = nullptr; + + // Multi-Scale Fusion Adapter (MSFA) components + ggml_tensor * msfa_concat_conv_w = nullptr; + ggml_tensor * msfa_concat_norm_w = nullptr; + ggml_tensor * msfa_ffn_expand_w = nullptr; + ggml_tensor * msfa_ffn_project_w = nullptr; + ggml_tensor * msfa_ffn_expand_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; + + + // pixtral, glm4v + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + ggml_tensor * mm_patch_merger_b = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * 
mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_pre_b = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + // lfm2 audio + std::array pre_encode_conv_X_w = {nullptr}; + std::array pre_encode_conv_X_b = {nullptr}; + ggml_tensor * pre_encode_out_w = nullptr; + ggml_tensor * pre_encode_out_b = nullptr; + + bool audio_has_avgpool() const { + return proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL + || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip.cpp b/llama.cpp/tools/mtmd/clip.cpp new file mode 100644 index 0000000..eeccb4c --- /dev/null +++ b/llama.cpp/tools/mtmd/clip.cpp @@ -0,0 +1,4080 @@ +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; + +//#define CLIP_DEBUG_FUNCTIONS + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 
for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + + +struct clip_ctx { + clip_model model; + + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + + std::vector buf_compute_meta; + + std::vector backend_ptrs; + std::vector backend_buft; + + ggml_backend_t backend = nullptr; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_buffer_ptr buf; + + + int max_nodes = 8192; + ggml_backend_sched_ptr sched; + clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; + bool is_allocated = false; + + clip_ctx(clip_context_params & ctx_params) { + flash_attn_type = ctx_params.flash_attn_type; + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (!backend_cpu) { + throw std::runtime_error("failed to initialize CPU backend"); + } + if (ctx_params.use_gpu) { + auto backend_name = std::getenv("MTMD_BACKEND_DEVICE"); + if (backend_name != nullptr) { + backend = ggml_backend_init_by_name(backend_name, nullptr); + if (!backend) { + LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name); + } + } + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); + backend = backend ? 
backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr); + } + } + + if (backend) { + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } else { + backend = backend_cpu; + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + if (ctx_params.image_min_tokens > 0) { + model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens; + } + if (ctx_params.image_max_tokens > 0) { + model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens; + } + + backend_ptrs.push_back(backend_cpu); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) + ); + + if (ctx_params.cb_eval != nullptr) { + ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); + } + } + + ~clip_ctx() { + ggml_backend_free(backend); + if (backend != backend_cpu) { + ggml_backend_free(backend_cpu); + } + } + + // this function is added so that we don't change too much of the existing code + projector_type proj_type() const { + return model.proj_type; + } +}; + +// +// clip_graph +// + +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} + +void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } +} + +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = interpolation_mode; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; + } + + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) + + return pos_embd; +} + +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + 
ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); + } + + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); + + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); + + cb(cur, "ffn_out", il); + + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } + + // residual 2 + cur = 
ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); + } + return inpL; +} + +// build the input after conv2d (inp_raw --> patches) +// returns tensor with shape [n_embd, n_patches] +ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; +} + +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} + +ggml_tensor * clip_graph::build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + cb(cur, "norm_w", il); + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_b", il); + } + + return cur; +} + +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { + + ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } + + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // F32 may not needed for vision encoders? 
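+        // shape bookkeeping for the naive path (editorial note, not part of the original code):
+        //   q   : [d_head, n_pos_q, n_head]   k : [d_head, n_pos_k, n_head]   (after the 0,2,1,3 permutes above)
+        //   kq  = ggml_mul_mat(k, q)   -> [n_pos_k, n_pos_q, n_head]
+        //   kq  = soft_max(kq * kq_scale + mask) over the key dimension, with kq_scale = 1/sqrt(d_head)
+        //   kqv = ggml_mul_mat(v, kq)  -> [d_head, n_pos_q, n_head], then flattened to [d_head*n_head, n_tokens]
+        // the optional F32 upcast below is left disabled; it can be re-enabled if the default matmul precision causes issues: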
+ // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + } + + cb(cur, "kqv_out", il); + + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// Generic function to stack frames for audio processing +// Abstracts out the StackAudioFrames logic used by ultravox +ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) { + if (stack_factor <= 1) { + return cur; + } + + int64_t total_elements = ggml_nelements(cur); + int64_t stride = n_embed * stack_factor; + + // Calculate padded length + int64_t padded_len = GGML_PAD(total_elements, stride); + int64_t pad = padded_len - total_elements; + + if (pad > 0) { + // Pad the tensor to make it divisible by stride + cur = ggml_view_1d(ctx0, cur, total_elements, 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + + // Reshape to [stride, padded_len / stride] + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int 
scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr builder; + + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_QWEN3VL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_KIMIVL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_KIMIK25: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_COGVLM: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_LFM2A: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_GLM4V: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + builder = std::make_unique(ctx, img); + } break; + default: + GGML_ABORT("missing cgraph builder"); + } + + return builder->build(); +} + +// +// clip_model_loader +// + +struct clip_model_loader { + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + + std::string fname; + + size_t model_size = 0; // in bytes + + bool has_vision = false; + bool has_audio = false; + + // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model + clip_model_loader(const char * fname) : fname(fname) { + struct ggml_context * meta = nullptr; + + 
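+        // two-phase load (editorial note, not part of the original code):
+        //   phase 1 (here):          open the GGUF with no_alloc = true, so only KV metadata and tensor
+        //                            descriptors (name, shape, type, offset) are materialized in `meta`
+        //   phase 2 (load_tensors):  duplicate the descriptors into the model's own ggml context,
+        //                            allocate backend buffers and read the weight data at the recorded offsets
+        // a minimal metadata-only probe following the same pattern (sketch; the file name is hypothetical):
+        //   ggml_context * meta_only = nullptr;
+        //   gguf_init_params probe_params = { /*.no_alloc =*/ true, /*.ctx =*/ &meta_only };
+        //   gguf_context * probe = gguf_init_from_file("mmproj-model.gguf", probe_params);
+        //   int64_t n_tensors = gguf_get_n_tensors(probe);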
struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params)); + if (!ctx_gguf.get()) { + throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); + } + + ctx_meta.reset(meta); + + const int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); + + // print gguf info + { + std::string name; + get_string(KEY_NAME, name, false); + std::string description; + get_string(KEY_DESCRIPTION, description, false); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get())); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get())); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get())); + LOG_INF("\n"); + } + + // modalities + { + get_bool(KEY_HAS_VISION_ENC, has_vision, false); + get_bool(KEY_HAS_AUDIO_ENC, has_audio, false); + + if (has_vision) { + LOG_INF("%s: has vision encoder\n", __func__); + } + if (has_audio) { + LOG_INF("%s: has audio encoder\n", __func__); + } + } + + // tensors + { + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i); + enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i); + ggml_tensor * cur = ggml_get_tensor(meta, name); + size_t tensor_size = ggml_nbytes(cur); + model_size += tensor_size; + LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); + } + } + } + + void load_hparams(clip_model & model, clip_modality modality) { + auto & hparams = model.hparams; + std::string log_ffn_op; // for logging + + // sanity check + if (modality == CLIP_MODALITY_VISION) { + GGML_ASSERT(has_vision); + } else if (modality == CLIP_MODALITY_AUDIO) { + GGML_ASSERT(has_audio); + } + model.modality = modality; + + + // projector type + std::string proj_type; + { + // default key + get_string(KEY_PROJ_TYPE, proj_type, false); + + // for models with mixed modalities + if (proj_type.empty()) { + if (modality == CLIP_MODALITY_VISION) { + get_string(KEY_VISION_PROJ_TYPE, proj_type, false); + } else if (modality == CLIP_MODALITY_AUDIO) { + get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false); + } else { + GGML_ABORT("unknown modality"); + } + } + + model.proj_type = clip_projector_type_from_string(proj_type); + + if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) { + throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str())); + } + + // correct arch for multimodal models (legacy method) + if (model.proj_type == PROJECTOR_TYPE_QWEN25O) { + model.proj_type = modality == CLIP_MODALITY_VISION + ? PROJECTOR_TYPE_QWEN25VL + : PROJECTOR_TYPE_QWEN2A; + } + } + + const bool is_vision = model.modality == CLIP_MODALITY_VISION; + const bool is_audio = model.modality == CLIP_MODALITY_AUDIO; + + // other hparams + { + const char * prefix = is_vision ? 
"vision" : "audio"; + get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd); + get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head); + get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff); + get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer); + get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim); + get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps); + + if (is_vision) { + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy + get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); + if (hparams.minicpmv_query_num == 0) { + // Fallback to hardcoded values for legacy models + if (hparams.minicpmv_version == 3) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 4) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 5) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 6) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 100045) { + hparams.minicpmv_query_num = 64; + } else { + hparams.minicpmv_query_num = 96; + } + } + } else if (is_audio) { + get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + // some hparams are unused, but still need to set to avoid issues + hparams.image_size = 0; + hparams.patch_size = 1; + + } else { + GGML_ASSERT(false && "unknown modality"); + } + + // for pinpoints, we need to convert it into a list of resolution candidates + { + std::vector pinpoints; + get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false); + if (!pinpoints.empty()) { + for (size_t i = 0; i < pinpoints.size(); i += 2) { + hparams.image_res_candidates.push_back({ + pinpoints[i], + pinpoints[i+1], + }); + } + } + } + + // default warmup value + hparams.warmup_image_size = hparams.image_size; + + hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2; + + { + bool use_gelu = false; + bool use_silu = false; + get_bool(KEY_USE_GELU, use_gelu, false); + get_bool(KEY_USE_SILU, use_silu, false); + if (use_gelu && use_silu) { + throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__)); + } + if (use_gelu) { + hparams.ffn_op = FFN_GELU; + log_ffn_op = "gelu"; + } else if (use_silu) { + hparams.ffn_op = FFN_SILU; + log_ffn_op = "silu"; + } else { + hparams.ffn_op = FFN_GELU_QUICK; + log_ffn_op = "gelu_quick"; + } + } + + { + std::string mm_patch_merge_type; + get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); + if (mm_patch_merge_type == "spatial_unpad") { + hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD; + } + } + + if (is_vision) { + int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); + int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); + GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); + GGML_ASSERT(idx_std >= 0 && "image_std not found"); + const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); + const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); + for (int i = 0; i < 3; ++i) { + hparams.image_mean[i] = mean_data[i]; + hparams.image_std[i] = std_data[i]; + } + } + + // Load the vision feature layer indices if they are explicitly provided; + // if 
multiple vision feature layers are present, the values will be concatenated + // to form the final visual features. + // NOTE: gguf conversions should standardize the values of the vision feature layer to + // be non-negative, since we use -1 to mark values as unset here. + std::vector vision_feature_layer; + get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); + // convert std::vector to std::unordered_set + for (auto & layer : vision_feature_layer) { + hparams.vision_feature_layer.insert(layer); + } + + // model-specific params + switch (model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (hparams.minicpmv_version == 0) { + hparams.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_INTERNVL: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); + } break; + case PROJECTOR_TYPE_LFM2: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + hparams.set_limit_image_tokens(64, 256); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json + // TODO: verify the image_min_tokens + hparams.n_merge = 1; // the original pixtral does not use patch merging + hparams.rope_theta = 10000.0f; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_KIMIVL: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // TODO: check kimivl preprocessor for exact values + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_KIMIK25: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + + int min_pixels = 0, max_pixels = 0; + get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false); + get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false); + if (min_pixels > 0 && max_pixels > 0) { + hparams.image_min_pixels = min_pixels; + hparams.image_max_pixels = max_pixels; + hparams.warmup_image_size = static_cast(std::sqrt(max_pixels)); + } else { + hparams.set_limit_image_tokens(2, 4096); + } + } break; + case PROJECTOR_TYPE_GEMMA3: + { + // default value (used by all model sizes in gemma 3 family) + // number of patches for each **side** is reduced by a factor of 4 + hparams.n_merge = 4; + // test model (tinygemma3) has a different value, we optionally read it + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + + case PROJECTOR_TYPE_GEMMA3NV: + { + // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) + // Similar configuration to Gemma3 + hparams.n_merge = 1; // MobileNetV5 handles resizing internally + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + hparams.n_merge = 2; // default value for Qwen 2 and 2.5 + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + // ref: 
https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size; + if (hparams.image_min_pixels < warn_min_pixels) { + LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__); + LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__); + LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); + } + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + hparams.n_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + std::vector wa_layer_indexes_vec; + get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true); + for (auto & layer : wa_layer_indexes_vec) { + hparams.wa_layer_indexes.insert(layer); + } + // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens + hparams.set_limit_image_tokens(1, 62500); + hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_GLM4V: + { + hparams.rope_theta = 10000.0f; + hparams.n_merge = 2; // default value for GLM4-V + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_LLAMA4: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + set_llava_uhd_res_candidates(model, 3); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || + model.proj_type == PROJECTOR_TYPE_VOXTRAL || + model.proj_type == PROJECTOR_TYPE_GLMA; + get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack); + hparams.ffn_op = FFN_GELU_ERF; + log_ffn_op = "gelu_erf"; // temporary solution for logging + + // audio preprocessing params + hparams.audio_chunk_len = 30; // in seconds + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 400; + hparams.audio_window_len = 400; + hparams.audio_hop_len = 160; + } break; + case PROJECTOR_TYPE_LFM2A: + { + // audio preprocessing params + hparams.audio_chunk_len = 1; // in seconds + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 512; + hparams.audio_window_len = 400; + hparams.audio_hop_len = 160; + } break; + default: + break; + } + + // sanity check + { + if (hparams.image_max_pixels < hparams.image_min_pixels) { + throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels)); + } + } + + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd); + LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head); + LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff); + LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer); + LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str()); + LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim); + if (is_vision) { + LOG_INF("\n--- vision hparams ---\n"); + LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size); + 
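+            // token <-> pixel conversion (editorial note, not part of the original code):
+            //   the warn_min_pixels check in the Qwen-VL branch above converts tokens to pixels as
+            //     pixels = tokens * n_merge^2 * patch_size^2
+            //   e.g. 1024 tokens with n_merge = 2 and patch_size = 14 gives 1024 * 4 * 196 = 802816 pixels,
+            //   i.e. roughly an 896 x 896 image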
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); + LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); + LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + if (!hparams.wa_layer_indexes.empty()) { + LOG_INF("%s: wa_layer_indexes: ", __func__); + for (auto & layer : hparams.wa_layer_indexes) { + LOG_INF("%d ", layer); + } + LOG_INF("\n"); + } + if (hparams.image_min_pixels > 0) { + LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); + } + if (hparams.image_max_pixels > 0) { + LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : ""); + } + } else if (is_audio) { + LOG_INF("\n--- audio hparams ---\n"); + LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins); + LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor); + LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len); + LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate); + LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft); + LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len); + LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len); + } + LOG_INF("\n"); + LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); + } + } + + void load_tensors(clip_ctx & ctx_clip) { + auto & model = ctx_clip.model; + auto & hparams = model.hparams; + std::map tensor_offset; + std::vector tensors_to_load; + + // TODO @ngxson : support both audio and video in the future + const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? 
"a" : "v"; + + // get offsets + for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); + } + + // create data context + struct ggml_init_params params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_clip.ctx_data.reset(ggml_init(params)); + if (!ctx_clip.ctx_data) { + throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__)); + } + + // helper function + auto get_tensor = [&](const std::string & name, bool required = true) { + ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); + if (!cur && required) { + throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + if (cur) { + tensors_to_load.push_back(cur); + // add tensors to context + ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); + ggml_set_name(data_tensor, cur->name); + cur = data_tensor; + } + return cur; + }; + + model.class_embedding = get_tensor(TN_CLASS_EMBD, false); + + model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false); + model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false); + + model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false); + model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false); + + model.patch_bias = get_tensor(TN_PATCH_BIAS, false); + model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); + model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); + + model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false); + model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false); + + model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); + + if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) { + hparams.n_layer = 0; // gemma3n does not use normal layer structure + } + + // layers + model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false); + layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false); + layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); + layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); + layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); + layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false); + layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias + layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias + + layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false); + layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); + layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); + layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.qkv_b = 
get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false); + layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); + layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); + + // ffn + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); + + + // qwen3vl deepstack layer + layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false); + layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false); + layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false); + layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false); + layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false); + layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false); + if (layer.has_deepstack()) { + model.n_deepstack_layers++; + } + + // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here + // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! + bool is_ffn_swapped = ( + // only old models need this fix + model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2 + || model.proj_type == PROJECTOR_TYPE_QWEN2VL + || model.proj_type == PROJECTOR_TYPE_QWEN25VL + || model.proj_type == PROJECTOR_TYPE_GLM_EDGE + || model.proj_type == PROJECTOR_TYPE_GEMMA3 + || model.proj_type == PROJECTOR_TYPE_IDEFICS3 + || model.proj_type == PROJECTOR_TYPE_MINICPMV + ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + if (is_ffn_swapped) { + // swap up and down weights + ggml_tensor * tmp = layer.ff_up_w; + layer.ff_up_w = layer.ff_down_w; + layer.ff_down_w = tmp; + // swap up and down biases + tmp = layer.ff_up_b; + layer.ff_up_b = layer.ff_down_b; + layer.ff_down_b = tmp; + if (il == 0) { + LOG_WRN("%s: ffn up/down are swapped\n", __func__); + } + } + } + + + switch (model.proj_type) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + { + // LLaVA projection + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + // Yi-type llava + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + // missing in Yi-type llava + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // Yi-type llava + model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); + model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); + model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); + model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); + if (model.mm_3_w) { + // 
TODO: this is a hack to support Yi-type llava + model.proj_type = PROJECTOR_TYPE_MLP_NORM; + } + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); + } break; + case PROJECTOR_TYPE_LDP: + { + // MobileVLM projection + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } break; + case PROJECTOR_TYPE_LDPV2: + { + // MobilVLM_V2 projection + model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); + model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); + model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); + model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); + model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); + 
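+                    // resampler layout (editorial note, not part of the original code):
+                    //   mm_model_query             - learned query tokens (minicpmv_query_num of them)
+                    //   mm_model_proj              - final projection applied to the resampler output
+                    //   mm_model_kv_proj           - projects the ViT features that the queries attend over (loaded below)
+                    //   mm_model_attn_{q,k,v,out}  - one cross-attention layer between queries and image features
+                    //   mm_model_ln_{q,kv,post}    - layernorms on the queries, the keys/values and the output
+                    //   mm_model_pos_embed_k       - 2D position embedding added to the keys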
model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); + model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); + model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); + model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); + model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); + model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); + model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); + model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); + model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); + model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); + model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); + model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); + model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); + model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); + model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_QWEN3VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm) + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0 + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2 + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_GLM4V: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); + model.mm_ffn_gate_w = 
get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight")); + model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias")); + } break; + case PROJECTOR_TYPE_GEMMA3: + { + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); + model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); + model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false); + + model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false); + model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded + model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); + model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); + + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); + + // Dynamically load blocks stage by stage + for (int stage = 0; stage < 4; ++stage) { + int blocks_found_in_stage = 0; + + for (int blk_idx = 0; ; ++blk_idx) { + bool found_block = false; + mobilenetv5_block block; + + // 1. Check for Edge Residual (S0) + block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false); + if (block.s0_conv_exp_w) { + found_block = true; + block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); + block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); + block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); + } + // 2. Check for UIR (Universal Inverted Residual) + else { + // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) + block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false); + block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false); + + if (block.dw_start_w || block.pw_exp_w) { + found_block = true; + if (block.dw_start_w) { + block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false); + } + if (block.pw_exp_w) { + block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false); + } + block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false); + if (block.dw_mid_w) { + block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false); + } + block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false); + if (block.pw_proj_w) { + block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false); + } + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + // 3. 
Check for Attention (MQA) + // Even if UIR/Edge check failed, this might be a pure attention block + ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false); + if (attn_q_check) { + found_block = true; + block.attn_q_w = attn_q_check; + block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false); + block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false); + block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false); + block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false); + block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false); + block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false); + block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false); + block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false); + // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check + if (!block.layer_scale_w) { + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + if (found_block) { + model.mobilenet_blocks.push_back(block); + blocks_found_in_stage++; + } else { + // End of blocks for this stage + break; + } + } + + // Track where this stage ends in the flat vector + if (blocks_found_in_stage > 0) { + model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); + LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); + } + } + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_LFM2: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // [IMG_BREAK] token embedding + model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b 
= get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight")); + } break; + case PROJECTOR_TYPE_QWEN2A: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias")); + } break; + case PROJECTOR_TYPE_VOXTRAL: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + } break; + case PROJECTOR_TYPE_GLMA: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = 
get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_COGVLM: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); + model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); + } break; + case PROJECTOR_TYPE_JANUS_PRO: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + } break; + case PROJECTOR_TYPE_LFM2A: + { + for (int i : {0, 2, 3, 5, 6}) { + model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight")); + model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias")); + } + model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight")); + model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias")); + + model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias")); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias")); + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias")); + layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias")); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias")); + + layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il)); + layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il)); + + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight")); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias")); + + layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, 
prefix, il, "weight")); + + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight")); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias")); + layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); + layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias")); + layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); + layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias")); + layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); + layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); + } + } break; + default: + GGML_ASSERT(false && "unknown projector type"); + } + + // load data + { + std::vector<uint8_t> read_buf; + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); + ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); + ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (auto & t : tensors_to_load) { + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + const size_t offset = tensor_offset[t->name]; + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); + } + size_t num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast<char *>(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + fin.close(); + + LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); + } + } + + struct support_info_op { + ggml_tensor * op; + + // true if the op runs on the accelerated ctx_clip.backend + bool is_accel = true; + }; + + struct support_info_graph { + // whether the clip_ctx.backend supports flash attention + bool fattn = true; + ggml_tensor * fattn_op = nullptr; // for debugging + + std::vector<support_info_op> ops; + }; + + static void warmup(clip_ctx & ctx_clip) { + // create a fake batch + const auto & hparams = ctx_clip.model.hparams; + clip_image_f32_batch batch; + clip_image_f32_ptr img(clip_image_f32_init()); + if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { + img->nx = hparams.warmup_image_size; + img->ny = hparams.warmup_image_size; + LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); + } else { + img->nx = hparams.warmup_audio_size; + img->ny = hparams.n_mel_bins; + LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); + } + batch.entries.push_back(std::move(img)); + warmup(ctx_clip, batch); + } + + static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + support_info_graph info; + + if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) { + // try to enable flash attention to see if it's supported + ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED; + info = alloc_compute_meta(ctx_clip, batch); + if (!info.fattn && info.fattn_op) { + auto op = info.fattn_op; +
LOG_WRN("%s: *****************************************************************\n", __func__); + LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend)); + LOG_WRN("%s: op params: \n", __func__); + static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) { + LOG_WRN("%s: %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn, + name, ggml_type_name(t->type), + t->ne[0], t->ne[1], t->ne[2], t->ne[3], + t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + }; + print_shape(__func__, " dst", op); + print_shape(__func__, "src0", op->src[0]); + print_shape(__func__, "src1", op->src[1]); + print_shape(__func__, "src2", op->src[2]); + LOG_WRN("%s: please report this on github as an issue\n", __func__); + LOG_WRN("%s: *****************************************************************\n", __func__); + ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED; + alloc_compute_meta(ctx_clip, batch); + } + } else { + info = alloc_compute_meta(ctx_clip, batch); + if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__); + } + } + + ctx_clip.is_allocated = true; // mark buffers as allocated + + LOG_INF("%s: flash attention is %s\n", __func__, + (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled"); + + // print ops that are not supported by the GPU backend (if there is one) + if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) { + std::vector unsupported_ops; + for (const auto & op : info.ops) { + if (!op.is_accel) { + unsupported_ops.push_back(op); + } + } + if (!unsupported_ops.empty()) { + LOG_WRN("%s: *****************************************************************\n", __func__); + LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__); + LOG_WRN("%s: the performance will be suboptimal \n", __func__); + LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend)); + for (const auto & op : unsupported_ops) { + LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__, + ggml_op_name(op.op->op), + ggml_type_name(op.op->type), + op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]); + } + LOG_WRN("%s: flash attention is %s\n", __func__, + (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? 
"enabled" : "disabled"); + LOG_WRN("%s: please report this on github as an issue\n", __func__); + LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__); + LOG_WRN("%s: *****************************************************************\n", __func__); + } + } + } + + static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + + ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); + ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + + for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { + ggml_backend_t backend = ctx_clip.backend_ptrs[i]; + ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend); + if (size > 1) { + LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get()); + const int n_nodes = ggml_graph_n_nodes(gf); + + LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes); + + support_info_graph res { + /*.fattn = */ true, + /*.fattn_op = */ nullptr, + /*.ops = */ {}, + }; + + // check op support + for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { + ggml_tensor * node = ggml_graph_node(gf, i); + res.ops.push_back({node, true}); + if (!ggml_backend_supports_op(ctx_clip.backend, node)) { + res.ops.back().is_accel = false; + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + res.fattn = false; + res.fattn_op = node; + } + } + } + + return res; + } + + void get_bool(const std::string & key, bool & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_bool(ctx_gguf.get(), i); + } + + void get_i32(const std::string & key, int & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_i32(ctx_gguf.get(), i); + } + + void get_u32(const std::string & key, int & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_u32(ctx_gguf.get(), i); + } + + void get_f32(const std::string & key, float & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_f32(ctx_gguf.get(), i); + } + + void get_string(const std::string & key, std::string & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = std::string(gguf_get_val_str(ctx_gguf.get(), i)); + } + + void get_arr_int(const std::string & key, std::vector & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + int n = 
gguf_get_arr_n(ctx_gguf.get(), i); + output.resize(n); + const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i); + for (int i = 0; i < n; ++i) { + output[i] = values[i]; + } + } + + static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) { + auto & hparams = model.hparams; + for (int x = 1; x <= max_patches_per_side; x++) { + for (int y = 1; y <= max_patches_per_side; y++) { + if (x == 1 && y == 1) { + continue; // skip the first point + } + hparams.image_res_candidates.push_back(clip_image_size{ + x*hparams.image_size, + y*hparams.image_size, + }); + } + } + } +}; + +struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) { + clip_ctx * ctx_vision = nullptr; + clip_ctx * ctx_audio = nullptr; + + try { + clip_model_loader loader(fname); + bool skip_audio = false; + + if (loader.has_vision) { + ctx_vision = new clip_ctx(ctx_params); + loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); + loader.load_tensors(*ctx_vision); + if (ctx_params.warmup) { + loader.warmup(*ctx_vision); + } + + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors + // we can remove this check when we implement audio support for Gemma 3N + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; + + // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); + } + + if (loader.has_audio && !skip_audio) { + ctx_audio = new clip_ctx(ctx_params); + loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); + loader.load_tensors(*ctx_audio); + if (ctx_params.warmup) { + loader.warmup(*ctx_audio); + } + } + + } catch (const std::exception & e) { + LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); + + delete ctx_vision; + delete ctx_audio; + + return {nullptr, nullptr}; + } + + return {ctx_vision, ctx_audio}; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + +struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +struct clip_image_f32_batch * clip_image_f32_batch_init() { + return new clip_image_f32_batch(); +} + +unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { + if (nx) *nx = img->nx; + if (ny) *ny = img->ny; + return img->buf.data(); +} + +void clip_image_size_free(struct clip_image_size * load_image_size) { + if (load_image_size == nullptr) { + return; + } + delete load_image_size; +} +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; } +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; } + +size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { + return batch->entries.size(); +} + +size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; + } + return batch->entries[idx]->nx; +} + +size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", 
__func__, idx); + return 0; + } + return batch->entries[idx]->ny; +} + +clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return nullptr; + } + return batch->entries[idx].get(); +} + +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { + img->nx = nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), rgb_pixels, img->buf.size()); +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + // TODO @ngxson : seems like this could be done more efficiently on cgraph + for (size_t i = 0; i < src.buf.size(); ++i) { + int c = i % 3; // rgb + dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +// set of tools to manipulate images +// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv +struct img_tool { + enum resize_algo { + RESIZE_ALGO_BILINEAR, + RESIZE_ALGO_BICUBIC, + // RESIZE_ALGO_LANCZOS, // TODO + }; + + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + bool add_padding = true, // TODO: define the behavior for add_padding = false + std::array<uint8_t, 3> pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (dst.nx == src.nx && dst.ny == src.ny) { + // no resize needed, simple copy + dst.buf = src.buf; + return; + } + + if (!add_padding) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast<float>(target_resolution.width) / src.nx; + float scale_h = static_cast<float>(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + fill(dst, pad_color); + + int offset_x = (target_resolution.width - new_width) / 2; + int offset_y = (target_resolution.height - new_height) / 2; + + composite(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 *
(i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast<float>(longest_edge) / inp_size.width, + static_cast<float>(longest_edge) / inp_size.height); + + float target_width_f = static_cast<float>(inp_size.width) * scale; + float target_height_f = static_cast<float>(inp_size.height) * scale; + + auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; }; + int aligned_width = ceil_by_factor(target_width_f); + int aligned_height = ceil_by_factor(target_height_f); + + return {aligned_width, aligned_height}; + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred to as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); + const int width = inp_size.width; + const int height = inp_size.height; + + auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; }; + auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; }; + + // always align up first + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + + // draw src image into dst image at offset (offset_x, offset_y) + static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + int dx = x + offset_x; + int dy = y + offset_y; + // skip pixels that would be out of bounds in the destination + if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + continue; + } + size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx)); + size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x)); + dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; + dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + } + } + } + + // fill the image with a solid color + static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) { + for (size_t i = 0; i < img.buf.size(); i += 3) { + img.buf[i] = color[0]; + img.buf[i + 1] = color[1]; +
img.buf[i + 2] = color[2]; + } + } + +private: + // Bilinear resize function + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } + } + + // Bicubic resize function + // part of image will be cropped if the aspect ratio is different + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5] = {}; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; + } + + static inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); + } + + // Linear interpolation between two points + static inline 
float lerp(float s, float e, float t) { + return s + (e - s) * t; + } +}; + +/** + * implementation of LLaVA-UHD: + * - https://arxiv.org/pdf/2403.11703 + * - https://github.com/thunlp/LLaVA-UHD + * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 + * + * overview: + * - an image always has a single overview (downscaled image) + * - an image can have zero or more slices, depending on the image size + * - each slice can then be considered as a separate image + * + * for example: + * + * [overview] --> [slice 1] --> [slice 2] + * | | + * +--> [slice 3] --> [slice 4] + */ +struct llava_uhd { + struct slice_coordinates { + int x; + int y; + clip_image_size size; + }; + + struct slice_instructions { + clip_image_size overview_size; // size of downscaled image + clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) + clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices + std::vector<slice_coordinates> slices; + + img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + bool padding_overview = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6) + std::array<uint8_t, 3> pad_color_overview = {0, 0, 0}; + + img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC; + bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6) + std::array<uint8_t, 3> pad_color_refined = {0, 0, 0}; + }; + + static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + slice_instructions res; + const int patch_size = clip_get_patch_size(ctx); + const int slice_size = clip_get_image_size(ctx); + const int original_width = original_size.width; + const int original_height = original_size.height; + + const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; + const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); + + if (!has_slices) { + // skip slicing logic + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = clip_image_size{0, 0}; + res.grid_size = clip_image_size{0, 0}; + + return res; + } + + if (has_pinpoints) { + // has pinpoints, use them to calculate the grid size (e.g.
llava-1.6) + auto refine_size = llava_uhd::select_best_resolution( + original_size, + ctx->model.hparams.image_res_candidates); + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + res.padding_refined = true; + res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding + + LOG_DBG("%s: using pinpoints for slicing\n", __func__); + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height); + + for (int y = 0; y < refine_size.height; y += slice_size) { + for (int x = 0; x < refine_size.width; x += slice_size) { + slice_coordinates slice; + slice.x = x; + slice.y = y; + slice.size.width = std::min(slice_size, refine_size.width - x); + slice.size.height = std::min(slice_size, refine_size.height - y); + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + res.grid_size.height = refine_size.height / slice_size; + res.grid_size.width = refine_size.width / slice_size; + LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height); + + return res; + } + + // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) + + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); + res.overview_size = best_size; + + { + const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it + const float log_ratio = log((float)original_width / original_height); + const float ratio = (float)original_width * original_height / (slice_size * slice_size); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); + auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); + res.grid_size = best_grid; + res.refined_size = refine_size; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height, + res.grid_size.width, res.grid_size.height); + + int width = refine_size.width; + int height = refine_size.height; + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_y = 0, ic = 0; + patches_y < refine_size.height && ic < best_grid.height; + patches_y += grid_y, ic += 1) { + for (int patches_x = 0, jc = 0; + patches_x < refine_size.width && jc < best_grid.width; + patches_x += grid_x, jc += 1) { + slice_coordinates slice; + slice.x = patches_x; + slice.y = patches_y; + slice.size.width = grid_x; + slice.size.height = grid_y; + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + } + + return res; + } + + static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { + std::vector output; + + // resize to overview size + clip_image_u8_ptr resized_img(clip_image_u8_init()); + img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, + inst.padding_overview, 
inst.pad_color_overview); + output.push_back(std::move(resized_img)); + + if (inst.slices.empty()) { + // no slices, just return the resized image + return output; + } + + // resize to refined size + clip_image_u8_ptr refined_img(clip_image_u8_init()); + img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined, + inst.padding_refined, inst.pad_color_refined); + + // create slices + for (const auto & slice : inst.slices) { + int x = slice.x; + int y = slice.y; + int w = slice.size.width; + int h = slice.size.height; + + clip_image_u8_ptr img_slice(clip_image_u8_init()); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); + output.push_back(std::move(img_slice)); + } + + return output; + } + +private: + static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + clip_image_size res; + res.width = ensure_divide(width, patch_size); + res.height = ensure_divide(height, patch_size); + return res; + } + + static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) { + float scale_width = static_cast(target_max.width) / orig.width; + float scale_height = static_cast(target_max.height) / orig.height; + float scale = std::min(scale_width, scale_height); + return clip_image_size{ + static_cast(orig.width * scale), + static_cast(orig.height * scale), + }; + } + + /** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * For example, when given a list of resolutions: + * - 100x100 + * - 200x100 + * - 100x200 + * - 200x200 + * + * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution). 
+ * + * @param original_size The original size of the image + * @param possible_resolutions A list of possible resolutions + * @return The best fit resolution + */ + static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) { + clip_image_size best_fit; + int min_wasted_area = std::numeric_limits<int>::max(); + int max_effective_resolution = 0; + + for (const clip_image_size & candidate : possible_resolutions) { + auto target_size = resize_maintain_aspect_ratio(original_size, candidate); + int effective_resolution = std::min( + target_size.width * target_size.height, + original_size.width * original_size.height); + int wasted_area = (candidate.width * candidate.height) - effective_resolution; + + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) { + max_effective_resolution = effective_resolution; + min_wasted_area = wasted_area; + best_fit = candidate; + } + + LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution); + } + + return best_fit; + } + + static int ensure_divide(int length, int patch_size) { + return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); + } + + static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + int grid_x = grid.width; + int grid_y = grid.height; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + clip_image_size grid_size; + grid_size.width = refine_width / grid_x; + grid_size.height = refine_height / grid_y; + + auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); + int best_grid_width = best_grid_size.width; + int best_grid_height = best_grid_size.height; + + clip_image_size refine_size; + refine_size.width = best_grid_width * grid_x; + refine_size.height = best_grid_height * grid_y; + return refine_size; + } + + static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector<int> candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector<clip_image_size> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.push_back(clip_image_size{m, split_grids_nums / m}); + } + ++m; + } + } + + clip_image_size best_grid{1, 1}; + float min_error = std::numeric_limits<float>::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; + } +}; + +// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g.
grid layout) +struct lfm2_vl_image_processor { + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 10; + static constexpr float max_pixels_tolerance = 2.0f; + static constexpr int tile_size = 512; + + static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + llava_uhd::slice_instructions inst; + const auto & params = ctx->model.hparams; + const int align_size = params.patch_size * params.n_merge; + + inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; + inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels); + + // tile if either dimension exceeds tile_size with tolerance + const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; + + if (!needs_tiling) { + inst.refined_size = clip_image_size{0, 0}; + inst.grid_size = clip_image_size{0, 0}; + return inst; + } + + const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); + + inst.grid_size = grid; + inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, + original_size.width, original_size.height, + inst.overview_size.width, inst.overview_size.height, + inst.refined_size.width, inst.refined_size.height, + grid.width, grid.height); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + llava_uhd::slice_coordinates slice; + slice.x = col * tile_size; + slice.y = row * tile_size; + slice.size = clip_image_size{tile_size, tile_size}; + inst.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", + __func__, (int)inst.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + return inst; + } + +private: + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; + } + + static std::vector get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h >= min_tiles && w * h <= max_tiles) { + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({w, h}); + } + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + 
return ratios; + } + + static clip_image_size get_grid_layout(int height, int width) { + const float aspect_ratio = static_cast(width) / height; + const auto ratios = get_target_ratios(); + return find_closest_aspect_ratio(aspect_ratio, ratios, width, height); + } +}; + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { + clip_image_size original_size{img->nx, img->ny}; + auto & params = ctx->model.hparams; + + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_MINICPMV: + { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + clip_image_u8 resized; + const clip_image_size new_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * 2, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + // clip_image_save_to_bmp(resized, "preproc.bmp"); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + const int patch_size = params.patch_size; // typically 16 + const int merge_size = params.n_merge; // typically 2 + const int align_size = patch_size * merge_size; // 32 + + const int max_num_patches = params.image_max_pixels > 0 ? 
+ params.image_max_pixels / (patch_size * patch_size) : 256; + + // Linear search for optimal scale to fit within max_num_patches + float scale = 1.0f; + int target_height = original_size.height; + int target_width = original_size.width; + + auto get_scaled_image_size = [align_size](float scale, int size) -> int { + float scaled_size = size * scale; + // Round up to nearest multiple of align_size + int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size; + // Ensure at least one patch + return std::max(align_size, aligned); + }; + + // Linear search with 0.02 step size + while (scale > 0.0f) { + target_height = get_scaled_image_size(scale, original_size.height); + target_width = get_scaled_image_size(scale, original_size.width); + + int num_patches_h = target_height / patch_size; + int num_patches_w = target_width / patch_size; + int num_patches = num_patches_h * num_patches_w; + + if (num_patches > max_num_patches) { + scale -= 0.02f; + } else { + break; + } + } + + clip_image_size new_size = {target_width, target_height}; + + // Resize the image + clip_image_u8 resized; + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + + // Normalize to float32 + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + + // Add to results + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_IDEFICS3: + { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, params.image_size, params.image_longest_edge); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{params.image_size, params.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), + }; + for (int y = 0; y < refined_size.height; y += params.image_size) { + for (int x = 0; x < refined_size.width; x += params.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(params.image_size, refined_size.width - x), + std::min(params.image_size, refined_size.height - y) + } + }); + } + } + auto imgs = llava_uhd::slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = instructions.grid_size.width; + res_imgs->grid_y = instructions.grid_size.height; + } break; + + case 
PROJECTOR_TYPE_GLM_EDGE: + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_GEMMA3NV: + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_JANUS_PRO: + { + // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384 + const std::array pad_color = {127, 127, 127}; + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + clip_image_u8 resized_image; + // the original pixtral model doesn't have n_merge + const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * cur_merge, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_LLAMA4: + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_LFM2: + { + auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_KIMIVL: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.n_merge, + 
+        case PROJECTOR_TYPE_KIMIVL:
+            {
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * params.n_merge,
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+                clip_image_u8 resized_img;
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(res));
+            } break;
+
+        case PROJECTOR_TYPE_KIMIK25:
+            {
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * params.n_merge,
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                const std::array<uint8_t, 3> pad_color = {0, 0, 0};
+
+                clip_image_u8 resized_img;
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(res));
+            } break;
+
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+        case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
+            {
+                // TODO @ngxson : refactor the code below to avoid duplicated logic
+
+                // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
+                // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+
+                clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
+
+                // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
+                if (params.image_res_candidates.empty()) { // pad_to_square
+                    // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
+                    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+                    const int longer_side = std::max(img->nx, img->ny);
+                    temp->nx = longer_side;
+                    temp->ny = longer_side;
+                    temp->buf.resize(3 * longer_side * longer_side);
+
+                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
+                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+                    // resize the image to the target_size
+                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+
+                    clip_image_f32_ptr res(clip_image_f32_init());
+                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
+                    res_imgs->entries.push_back(std::move(res));
+
+                } else {
+                    // "spatial_unpad" with "anyres" processing for llava-1.6
+                    auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+                    std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+                    for (size_t i = 0; i < imgs.size(); ++i) {
+                        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+                        clip_image_f32_ptr res(clip_image_f32_init());
+                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                        res_imgs->entries.push_back(std::move(res));
+                    }
+                }
+            } break;
+
+        default:
+            LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
+            return false;
+    }
+
+    return true;
+}
+
+ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
+    return ctx->model.image_newline;
+}
+
+void
clip_free(clip_ctx * ctx) { + if (ctx == nullptr) { + return; + } + delete ctx; +} + +// deprecated +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + const int32_t nx = ctx->model.hparams.image_size; + const int32_t ny = ctx->model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); +} + +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_get_image_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.image_size; +} + +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.patch_size; +} + +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.n_embd; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat"; +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->nx / params.patch_size) / 2; + default: + break; + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->ny / params.patch_size) / 2; + default: + break; + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + + // for models with fixed size image, the input image is already pre-processed and resized to square + int patch_size = params.patch_size; + int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + + projector_type proj = ctx->proj_type(); + + switch (proj) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_JANUS_PRO: + { + // do nothing + } break; + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + n_patches /= 4; + if (ctx->model.mm_boi) { + n_patches += 2; // for BOI and EOI token embeddings + } + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // Use actual config value if available, otherwise fall back to hardcoded values + if (params.minicpmv_query_num > 0) { + n_patches = params.minicpmv_query_num; + } else { + // Fallback to hardcoded values for legacy models + if (params.minicpmv_version == 2) { + n_patches = 96; + } else if (params.minicpmv_version == 3) { + n_patches = 64; + } else if (params.minicpmv_version == 4) { + n_patches = 64; + } else if (params.minicpmv_version == 5) { + // MiniCPM-V 4.0 + n_patches = 64; + } else if (params.minicpmv_version == 6) { + // MiniCPM-V 4.5 + n_patches = 64; + } else if (params.minicpmv_version == 100045) { + // MiniCPM-o 4.5 + n_patches = 64; + } else { + GGML_ABORT("Unknown minicpmv version"); + } + } + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case 
PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + { + // dynamic size (2 conv, so double patch size) + int x_patch = img->nx / (params.patch_size * 2); + int y_patch = img->ny / (params.patch_size * 2); + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_LLAMA4: + { + // both X and Y are downscaled by the scale factor + int scale_factor = ctx->model.hparams.n_merge; + n_patches /= (scale_factor * scale_factor); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution + // regardless of input size (see architecture description) + n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size; + } break; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + // dynamic size + int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; + int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // dynamic size + int n_merge = ctx->model.hparams.n_merge; + int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + if (ctx->model.token_embd_img_break) { + n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row + } else { + n_patches = n_patches_y * n_patches_x; + } + } break; + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + n_patches = img->nx; + + const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; + if (ctx->model.audio_has_stack_frames()) { + GGML_ASSERT(proj_stack_factor > 0); + const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor); + n_patches = n_len / proj_stack_factor; + } + + // whisper downscales input token by half after conv1d + n_patches /= 2; + + if (ctx->model.audio_has_avgpool()) { + // divide by 2 because of nn.AvgPool1d(2, stride=2) + n_patches /= 2; + } + } break; + case PROJECTOR_TYPE_GLMA: + { + n_patches = img->nx; + // whisper downscales input token by half after conv1d + n_patches /= 2; + // reshape by merge_factor + n_patches /= ctx->model.hparams.proj_stack_factor; + // for BOI and EOI token embeddings + n_patches += 2; + } break; + case PROJECTOR_TYPE_COGVLM: + { + n_patches += 2; // for BOI and EOI token embeddings + } break; + case PROJECTOR_TYPE_LFM2A: + { + n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + } break; + default: + GGML_ABORT("unsupported projector type"); + } + + return n_patches; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + clip_image_f32_batch imgs; + clip_image_f32_ptr img_copy(clip_image_f32_init()); + *img_copy = *img; + imgs.entries.push_back(std::move(img_copy)); + + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return 
false; // only support batch size of 1
+    }
+
+    // if buffers are not allocated, we need to do a warmup run to allocate them
+    if (!ctx->is_allocated) {
+        clip_model_loader::warmup(*ctx, *imgs_c_ptr);
+    }
+
+    // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+
+    // set inputs
+    const auto & model   = ctx->model;
+    const auto & hparams = model.hparams;
+
+    const int image_size_width  = imgs.entries[0]->nx;
+    const int image_size_height = imgs.entries[0]->ny;
+
+    const int patch_size  = hparams.patch_size;
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;
+
+    auto get_inp_tensor = [&gf](const char * name) {
+        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
+        if (inp == nullptr) {
+            GGML_ABORT("Failed to get tensor %s", name);
+        }
+        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
+            GGML_ABORT("Tensor %s is not an input tensor", name);
+        }
+        return inp;
+    };
+
+    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_I32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    // set input pixel values
+    if (!imgs.is_audio) {
+        size_t nelem = 0;
+        for (const auto & img : imgs.entries) {
+            nelem += img->nx * img->ny * 3;
+        }
+        std::vector<float> inp_raw(nelem);
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        //   ┌──W──┐
+        //   │     H │  channel = R
+        //   ├─────┤ │
+        //   │     H │  channel = G
+        //   ├─────┤ │
+        //   │     H │  channel = B
+        //   └─────┘ │
+        //     ──────┘ x B
+
+        for (size_t i = 0; i < imgs.entries.size(); i++) {
+            const int nx = imgs.entries[i]->nx;
+            const int ny = imgs.entries[i]->ny;
+            const int n  = nx * ny;
+
+            for (int b = 0; b < batch_size; b++) {
+                float * batch_entry = inp_raw.data() + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                    }
+                }
+            }
+        }
+        set_input_f32("inp_raw", inp_raw);
+
+    } else {
+        // audio input
+        GGML_ASSERT(imgs.entries.size() == 1);
+        const auto & mel_inp = imgs.entries[0];
+        const int n_step = mel_inp->nx;
+        const int n_mel  = mel_inp->ny;
+        std::vector<float> inp_raw(n_step * n_mel);
+        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
+        set_input_f32("inp_raw", inp_raw);
+    }
+
+    // set input per projector
+    switch (ctx->model.proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from siglip:
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+                std::vector<int32_t> positions(pos_h * pos_w);
+                int bucket_coords_h[1024];
+                int bucket_coords_w[1024];
+                for (int i = 0; i < pos_h; i++){
+                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+                }
+                for (int i = 0; i < pos_w; i++){
+                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+                }
+                for (int i = 0, id = 0; i < pos_h; i++){
+                    for (int j = 0; j < pos_w; j++){
+                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                    }
+                }
+                set_input_i32("positions", positions);
+
+                // inputs for resampler projector
+                // set the 2D positions (using float for sinusoidal embedding)
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<float> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
+                }
+                set_input_f32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
+                }
+                set_input_f32("pos_w", pos_data);
+                // base frequency omega
+                const float base_freq = 10000.0f;
+                const int n_embd_proj = clip_n_mmproj_embd(ctx);
+                std::vector<float> omega(n_embd_proj / 4);
+                for (int i = 0; i < n_embd_proj / 4; ++i) {
+                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
+                }
+                set_input_f32("omega", omega);
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            {
+                const int merge_ratio = hparams.n_merge;
+                const int pw = image_size_width  / patch_size;
+                const int ph = image_size_height / patch_size;
+                std::vector<int32_t> positions(n_pos * 4);
+                int ptr = 0;
+                for (int y = 0; y < ph; y += merge_ratio) {
+                    for (int x = 0; x < pw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                positions[                  ptr] = y + dy;
+                                positions[    num_patches + ptr] = x + dx;
+                                positions[2 * num_patches + ptr] = y + dy;
+                                positions[3 * num_patches + ptr] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                // pw * ph   = number of tokens output by the ViT after applying the patch merger
+                // ipw * iph = number of vision tokens processed inside the ViT
+                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
+                const int merge_ratio = 2;
+                const int pw  = image_size_width  / patch_size / merge_ratio;
+                const int ph  = image_size_height / patch_size / merge_ratio;
+                const int ipw = image_size_width  / patch_size;
+                const int iph = image_size_height / patch_size;
+
+                std::vector<int32_t> idx    (ph * pw);
+                std::vector<int32_t> inv_idx(ph * pw);
+
+                if (use_window_attn) {
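+                    // build a permutation (idx / inv_idx) that groups all patches of one attention window into a contiguous range, plus the additive attention mask for those windows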
+                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int grid_window = attn_window_size / patch_size / merge_ratio;
+                    int dst = 0;
+                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
+                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
+                    int mask_row = 0;
+
+                    for (int y = 0; y < ph; y += grid_window) {
+                        for (int x = 0; x < pw; x += grid_window) {
+                            const int win_h = std::min(grid_window, ph - y);
+                            const int win_w = std::min(grid_window, pw - x);
+                            const int dst_0 = dst;
+                            // group all tokens belonging to the same window together (into a contiguous range)
+                            for (int dy = 0; dy < win_h; dy++) {
+                                for (int dx = 0; dx < win_w; dx++) {
+                                    const int src = (y + dy) * pw + (x + dx);
+                                    GGML_ASSERT(src < (int)idx.size());
+                                    GGML_ASSERT(dst < (int)inv_idx.size());
+                                    idx    [src] = dst;
+                                    inv_idx[dst] = src;
+                                    dst++;
+                                }
+                            }
+
+                            for (int r = 0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
+                                int row_offset = mask_row * (ipw * iph);
+                                std::fill(
+                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
+                                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
+                                    0.0);
+                                mask_row++;
+                            }
+                        }
+                    }
+
+                    set_input_i32("window_idx",     idx);
+                    set_input_i32("inv_window_idx", inv_idx);
+                    set_input_f32("window_mask",    mask);
+                } else {
+                    for (int i = 0; i < ph * pw; i++) {
+                        idx[i] = i;
+                    }
+                }
+
+                const int mpow = merge_ratio * merge_ratio;
+                std::vector<int32_t> positions(n_pos * 4);
+
+                int ptr = 0;
+                for (int y = 0; y < iph; y += merge_ratio) {
+                    for (int x = 0; x < ipw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                auto remap = idx[ptr / mpow];
+                                remap = (remap * mpow) + (ptr % mpow);
+
+                                positions[                  remap] = y + dy;
+                                positions[    num_patches + remap] = x + dx;
+                                positions[2 * num_patches + remap] = y + dy;
+                                positions[3 * num_patches + remap] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
+        case PROJECTOR_TYPE_LIGHTONOCR:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = model.class_embedding ?
1 : 0;
+                std::vector<int32_t> patches(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    patches[i] = i + patch_offset;
+                }
+                set_input_i32("patches", patches);
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA3NV:
+        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_LFM2:
+        case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+        case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_COGVLM:
+            {
+                // do nothing
+            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
+
+                auto d_model = 512;
+                auto seq_len = n_frames * 2 - 1;
+                std::vector<float> pos_emb(d_model*seq_len);
+                std::vector<float> inv_freq(d_model / 2);
+                for (size_t i = 0; i < inv_freq.size(); ++i) {
+                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
+                }
+                for (int64_t pos = 0; pos < seq_len; ++pos) {
+                    for (size_t i = 0; i < inv_freq.size(); ++i) {
+                        const float ang = (n_frames - pos - 1) * inv_freq[i];
+                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
+                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
+                    }
+                }
+                set_input_f32("pos_emb", pos_emb);
+            } break;
+        default:
+            GGML_ABORT("Unknown projector type");
+    }
+
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ?
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + if (vec != nullptr) { + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + } + + // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set + if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) { + const int64_t n_embd = embeddings->ne[0]; + const int64_t n_tokens = embeddings->ne[1]; + std::vector emb_data(n_embd * n_tokens); + ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); + + LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); + LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens); + + // Print first few values of first token + LOG_INF("Token 0 (first 16 values): "); + for (int i = 0; i < std::min((int64_t)16, n_embd); i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + + // Print last few values of first token + if (n_embd > 16) { + LOG_INF("Token 0 (last 16 values): "); + for (int64_t i = n_embd - 16; i < n_embd; i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + } + + // Compute and print statistics + float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0]; + for (size_t i = 0; i < emb_data.size(); i++) { + sum += emb_data[i]; + sum_sq += emb_data[i] * emb_data[i]; + min_val = std::min(min_val, emb_data[i]); + max_val = std::max(max_val, emb_data[i]); + } + float mean = sum / emb_data.size(); + float variance = (sum_sq / emb_data.size()) - (mean * mean); + LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n", + mean, sqrtf(variance), min_val, max_val, sum); + LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n"); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + return ctx->model.mm_model_proj->ne[0]; + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_YOUTUVL: + return ctx->model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_QWEN3VL: + // main path + deepstack paths + return ctx->model.mm_1_b->ne[0] * (1 + 
ctx->model.n_deepstack_layers); + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3NV: + return ctx->model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->model.projection->ne[1]; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_INTERNVL: + return ctx->model.mm_3_w->ne[1]; + case PROJECTOR_TYPE_LLAMA4: + return ctx->model.mm_model_proj->ne[1]; + case PROJECTOR_TYPE_QWEN2A: + return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_GLMA: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_COGVLM: + return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_LFM2A: + return ctx->model.position_embeddings->ne[0]; + case PROJECTOR_TYPE_GLM4V: + return ctx->model.mm_ffn_down_w->ne[1]; + default: + GGML_ABORT("Unknown projector type"); + } +} + +int clip_is_minicpmv(const struct clip_ctx * ctx) { + // TODO: remove this function + if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { + return ctx->model.hparams.minicpmv_version; + } + return 0; +} + +bool clip_is_glm(const struct clip_ctx * ctx) { + // TODO: remove this function + return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; +} + +bool clip_is_llava(const struct clip_ctx * ctx) { + return ctx->model.hparams.has_llava_projector; +} + +bool clip_has_vision_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_VISION; +} + +bool clip_has_audio_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_AUDIO; +} + +bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return true; + default: + return false; + } +} + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, n_threads, &clip_img, vec); + return true; +} + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx) { + return ctx->proj_type(); +} + +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { + clip_image_f32 * audio = new clip_image_f32; + audio->nx = n_frames; + audio->ny = n_mel; + audio->buf.resize(n_frames * n_mel); + std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); + + batch->entries.push_back(clip_image_f32_ptr(audio)); + batch->is_audio = true; +} + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { + return &ctx->model.hparams; +} + +// +// API for debugging +// +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { + clip_image_f32 img; + img.nx = w; + img.ny = h; + img.buf.resize(h * w * 3); + for (int i = 0; i < h * w * 3; i++) { + img.buf[i] = static_cast(fill_value); + } + clip_image_encode(ctx, 1, &img, nullptr); + GGML_ASSERT(img.buf.empty() && "expected, always stop here"); +} diff --git a/llama.cpp/tools/mtmd/clip.h b/llama.cpp/tools/mtmd/clip.h new file mode 100644 index 0000000..71b5848 --- /dev/null +++ 
b/llama.cpp/tools/mtmd/clip.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ggml.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+// !!! Internal header, to be used by mtmd only !!!
+
+#define MTMD_INTERNAL_HEADER
+
+struct clip_ctx;
+
+struct clip_image_size {
+    int width;
+    int height;
+};
+
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
+
+enum clip_modality {
+    CLIP_MODALITY_VISION,
+    CLIP_MODALITY_AUDIO,
+};
+
+enum clip_flash_attn_type {
+    CLIP_FLASH_ATTN_TYPE_AUTO     = -1,
+    CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
+    CLIP_FLASH_ATTN_TYPE_ENABLED  = 1,
+};
+
+struct clip_context_params {
+    bool use_gpu;
+    enum clip_flash_attn_type flash_attn_type;
+    int image_min_tokens;
+    int image_max_tokens;
+    bool warmup;
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
+};
+
+struct clip_init_result {
+    struct clip_ctx * ctx_v; // vision context
+    struct clip_ctx * ctx_a; // audio context
+};
+
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
+
+void clip_free(struct clip_ctx * ctx);
+
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+int32_t clip_get_image_size (const struct clip_ctx * ctx);
+int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+// TODO: should be enum, not string
+const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+struct clip_image_size      * clip_image_size_init(void);
+struct clip_image_u8        * clip_image_u8_init (void);
+struct clip_image_f32       * clip_image_f32_init(void);
+struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+// nx, ny are the output image dimensions
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+void clip_image_size_free (struct clip_image_size * img_size);
+void clip_image_u8_free (struct clip_image_u8 * img);
+void clip_image_f32_free(struct clip_image_f32 * img);
+void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+// use for accessing underlying data of clip_image_f32_batch
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
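+// note: a typical call sequence is clip_init() -> clip_image_preprocess() -> clip_image_batch_encode() -> clip_free()
+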
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);
+
+struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+int clip_is_minicpmv(const struct clip_ctx * ctx);
+bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_llava(const struct clip_ctx * ctx);
+// note for contributor: this clip_is_(model) pattern is deprecated
+// do NOT add new functions like this
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+
+// used by audio input
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
+
+bool clip_has_vision_encoder(const struct clip_ctx * ctx);
+bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
diff --git a/llama.cpp/tools/mtmd/deprecation-warning.cpp b/llama.cpp/tools/mtmd/deprecation-warning.cpp
new file mode 100644
index 0000000..dded0a5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/deprecation-warning.cpp
@@ -0,0 +1,22 @@
+#include <cstdio>
+#include <string>
+
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    size_t pos = filename.find_last_of("/\\");
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
diff --git a/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
new file mode 100644
index 0000000..2949fae
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -0,0 +1,412 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    # Standardize the transformers llava next keys for
+    # image newline / mm projector with the classes in haotian-liu LLaVA
+    if name ==
"image_newline": + return "model.image_newline" + if name.startswith("multi_modal_projector"): + name = name.replace("multi_modal_projector", "mm") + if "linear_1" in name: + name = name.replace("linear_1", "0") + if "linear_2" in name: + name = name.replace("linear_2", "2") + return name + + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") + +# Selectable visual encoders that are compatible with this script +encoder_group = ap.add_mutually_exclusive_group() +encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False, + help="the visual encoder is Siglip.") + +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if ( + args.clip_model_is_vision or + not os.path.exists(dir_model + "/vocab.json") or + args.clip_model_is_openclip or + args.clip_model_is_siglip +): + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +if args.clip_model_is_siglip: + model = SiglipVisionModel.from_pretrained(dir_model) + processor = None +elif args.clip_model_is_vision or args.clip_model_is_openclip: + model = CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG) + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add 
projector type + fout.add_string("clip.projector_type", args.projector_type) +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + if args.clip_model_is_siglip: + text_projection_dim = 0 + else: + text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"]) + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", text_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + + + +def get_non_negative_vision_feature_layers(v_hparams): + """ + Determine the vision feature layer(s) for the llava model, which are indices into the + hidden states of the visual encoder. Note that the hidden states array generally takes the + form: + + [, , ... ] + + so feature indices should be offset as n+1 to get the output of encoder block n. + We convert all vision feature layers to non-negative so that -1 can be used in + the model as an unset value. If no vision feature layer is found, we leave it unset. + """ + num_hidden_layers = v_hparams["num_hidden_layers"] + to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1 + feature_layers_key = None + # Key used for llava models in transformers + if "vision_feature_layer" in config: + feature_layers_key = "vision_feature_layer" + # Key used for llava models in the original format + elif "mm_vision_select_layer" in config: + feature_layers_key = "mm_vision_select_layer" + if feature_layers_key is not None: + feature_layers = config[feature_layers_key] + if isinstance(feature_layers, int): + feature_layers = [feature_layers] + return [to_non_negative(feature_layer) for feature_layer in feature_layers] + +# Determine if we have explicitly specified vision feature layers in our config +feature_layers = get_non_negative_vision_feature_layers(v_hparams) + +if has_vision_encoder: + # Siglip does not have a visual projector; set projection dim to 0 + if args.clip_model_is_siglip: + visual_projection_dim = 0 + else: + visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"]) + + # set vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", visual_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + if feature_layers: + block_count = max(feature_layers) + else: + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 
1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + for p in pinpoint: + image_grid_pinpoints.append(p) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if feature_layers: + fout.add_array("clip.vision.feature_layer", feature_layers) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue] + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + # By default, we drop the last layer for llava projector + # models unless we have explicitly set vision feature layers + if feature_layers is None: + model.vision_model.encoder.layers.pop(-1) + else: + model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)] + + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape 
= {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..848ef1c --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py @@ -0,0 +1,280 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * + +TEXT = "clip.text" +VISION = "clip.vision" +from transformers import SiglipVisionModel, SiglipVisionConfig + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if name in ( + "vision_model.head.probe", + "vision_model.head.attention.in_proj_weight", + "vision_model.head.attention.in_proj_bias", + "vision_model.head.attention.out_proj.weight", + "vision_model.head.attention.out_proj.bias", + "vision_model.head.layernorm.weight", + "vision_model.head.layernorm.bias", + "vision_model.head.mlp.fc1.weight", + "vision_model.head.mlp.fc1.bias", + "vision_model.head.mlp.fc2.weight", + "vision_model.head.mlp.fc2.bias" + ): + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = None + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +vision_config = SiglipVisionConfig(**v_hparams) +model = SiglipVisionModel(vision_config) +model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip"))) + +fname_middle = None +has_text_encoder = False +has_vision_encoder = True +has_glm_projector = True +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif 
args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_glm_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_glm_projector", has_glm_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if has_glm_projector: + fout.add_description("image encoder for glm4v") + fout.add_string("clip.projector_type", "adapter") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"]) + + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +fout.add_bool("clip.use_gelu", True) + + +if has_glm_projector: + # model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + if name.startswith("vision."): + name=name.replace("vision.","") + fout.add_tensor(name, data) + print(f"Projector {name} - {data.dtype} - shape = {data.shape}") + # print(f"Projector {name} tensors added\n") + +state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, 
has_glm_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + # print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + print(f"siglip {name} - {data.dtype} - shape = {data.shape}") + # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py new file mode 100644 index 0000000..16bb915 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py @@ -0,0 +1,33 @@ +import argparse +import os +import torch +from transformers import AutoModel + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to GLM model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/glm.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")] +if len(clip_tensors) > 0: + clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/glm.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py new file mode 100644 index 0000000..4f2da3b --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py @@ -0,0 +1,38 @@ +import argparse +import glob +import os +import torch + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1] +checkpoint = torch.load(path) + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} 
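+# Editorial sketch (not part of the upstream script): the file written below is a
+# plain torch-saved dict of float32 tensors keyed by their original checkpoint
+# names, so the surgery result can be sanity-checked afterwards with e.g.:
+#
+#   import torch
+#   proj = torch.load(f"{args.model}/llava.projector", map_location="cpu")
+#   for name, t in proj.items():
+#       print(name, tuple(t.shape), t.dtype)   # e.g. model.mm_projector.0.weight
+#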
+torch.save(projector, f"{args.model}/llava.projector") + +# BakLLaVA models contain CLIP tensors in it +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] +if len(clip_tensors) > 0: + clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + + + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py new file mode 100644 index 0000000..b07c3e3 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py @@ -0,0 +1,180 @@ +import argparse +import glob +import os +import torch +from safetensors import safe_open +from safetensors.torch import save_file +from typing import Any, ContextManager, cast + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + tensors = {} + with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + +# Helpers to match weight names from specific components or +# determine if a saved shard contains that component +def is_vision_tower(weight_name): + return ( + weight_name.startswith("model.vision_tower") or + weight_name.startswith("vit.") or + weight_name.startswith("vision_tower") + ) + +def is_newline(weight_name): + return ( + weight_name.startswith("model.image_newline") or + weight_name.startswith("image_newline") + ) + +def is_mm_projector(weight_name): + return ( + weight_name.startswith("model.mm_projector") or + weight_name.startswith("vision_proj.") or + weight_name.startswith("multi_modal_projector") + ) + +def newline_criteria(checkpoint): + return any(is_newline(k) for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(is_mm_projector(k) for k in checkpoint.keys()) + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + print(f"Loading existing llava.clip from {clip_path}") + existing_clip, _ = load_model(clip_path) + else: + print(f"Creating new 
llava.clip at {clip_path}") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + checkpoint_path = checkpoint_path + return True + return False + +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break once none is found, so far all models append them at the end + # break + print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) + +print(f"Taking projector from {projector_checkpoint_path}") +first_mm_tensors = [] +first_checkpoint = None +if newline_checkpoint_path is not None: + print(f"Taking newline from {newline_checkpoint_path}") + first_checkpoint, file_type = load_model(newline_checkpoint_path) + first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)] + +# Load the checkpoint +mm_tensors = [] +last_checkpoint = None +if projector_checkpoint_path is not None: + last_checkpoint, file_type = load_model(projector_checkpoint_path) + mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)] + +if len(mm_tensors) == 0: + if last_checkpoint is not None: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + assert last_checkpoint is not None + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + assert first_checkpoint is not None + projector[name] = first_checkpoint[name].float() + +if len(projector) > 0: + save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..944037e --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,892 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Siglip model. 
""" +# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes + + +import os +import math +import warnings + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out + +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import ( + logging, +) +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +class SiglipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a + Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ Example: + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipVisionConfig() + >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + +_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + +SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/siglip-base-patch16-224", + # See all SigLIP models at https://huggingface.co/models?filter=siglip +] + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + if tensor.dtype in [torch.float16, torch.bfloat16]: + # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu + og_dtype = tensor.dtype + tensor = tensor.to(torch.float32) + tensor.erfinv_() + tensor = tensor.to(og_dtype) + else: + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + if tensor.dtype == torch.float16: + # The `clamp_` op is not (yet?) 
defined in float16+cpu + tensor = tensor.to(torch.float32) + tensor.clamp_(min=a, max=b) + tensor = tensor.to(torch.float16) + else: + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 +): + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + denom = fan_in + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip +class SiglipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip +class SiglipEncoderLayer(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.self_attn = ( + SiglipAttention(config) + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SiglipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + + if isinstance(module, SiglipVisionEmbeddings): + width = self.config.hidden_size + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.normal_(module.q_proj.weight) + nn.init.normal_(module.k_proj.weight) + nn.init.normal_(module.v_proj.weight) + nn.init.normal_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.normal_(module.fc1.weight) + nn.init.normal_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SIGLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +SIGLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. + Args: + config: SiglipConfig + """ + + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + +class SiglipVisionTransformer(SiglipPreTrainedModel): + config_class = SiglipVisionConfig + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + + def __init__(self, config: SiglipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embedding + +import argparse +import json +import re + +import numpy as np +from gguf import * +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def add_key_str(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + 
+ +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +# Read config.json to get actual model configuration +config_path = os.path.join(dir_model, "config.json") +model_config = {} +if os.path.isfile(config_path): + with open(config_path, "r", encoding="utf-8") as f: + model_config = json.load(f) + print(f"Loaded config from {config_path}") +else: + print(f"Warning: config.json not found at {config_path}") + +# If minicpmv_projector is not specified but the default path exists, use the default path +if args.minicpmv_projector is None: + default_projector_path = os.path.join(dir_model, "minicpmv.projector") + if os.path.isfile(default_projector_path): + args.minicpmv_projector = default_projector_path + print(f"Found default projector file: {default_projector_path}") + +# If output_dir is not specified, use model_dir as the default value +if args.output_dir is None: + args.output_dir = dir_model + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +minicpmv_version = args.minicpmv_version + +# Use actual config values instead of hardcoded ones +if model_config: + # For the projector/resampler, use the main model's hidden_size + emb_dim = model_config.get("hidden_size", 1536) + + # For the vision model, use vision_config values + vision_config_dict = model_config.get("vision_config", {}) + default_vision_config = { + "hidden_size": vision_config_dict.get("hidden_size", 1152), + "image_size": vision_config_dict.get("image_size", 980), + "intermediate_size": vision_config_dict.get("intermediate_size", 4304), + "model_type": vision_config_dict.get("model_type", "siglip"), + "num_attention_heads": vision_config_dict.get("num_attention_heads", 16), + 
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27), + "patch_size": vision_config_dict.get("patch_size", 14), + } + + # Use vision model's num_hidden_layers for block_count + block_count = vision_config_dict.get("num_hidden_layers", 27) + + print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}") + print(f"Vision config: {default_vision_config}") +else: + # Fallback to original hardcoded logic if config.json not found + emb_dim = 4096 + block_count = 26 + if minicpmv_version == 1: + emb_dim = 2304 + block_count = 26 + elif minicpmv_version == 2: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 3: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 4: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 5: + emb_dim = 2560 + block_count = 27 + elif minicpmv_version == 6: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 100045: + emb_dim = 4096 + block_count = 27 + + default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + +vision_config = Idefics2VisionConfig(**default_vision_config) +model = Idefics2VisionTransformer(vision_config) +if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"): + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 4: + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 5: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 6: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 100045: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False + +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP 
model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") + fout.add_int32("clip.minicpmv_version", minicpmv_version) +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams - use actual config values + vision_image_size = model_config.get("image_size", 448) if model_config else 448 + vision_patch_size = default_vision_config.get("patch_size", 14) + vision_hidden_size = default_vision_config.get("hidden_size", 1152) + vision_intermediate_size = default_vision_config.get("intermediate_size", 4304) + vision_attention_heads = default_vision_config.get("num_attention_heads", 16) + + fout.add_uint32("clip.vision.image_size", vision_image_size) + fout.add_uint32("clip.vision.patch_size", vision_patch_size) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads) + fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + + # Add MiniCPM-V specific parameters + query_num = model_config.get("query_num", 0) if model_config else 0 + resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0 + fout.add_uint32("clip.minicpmv_query_num", query_num) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if has_minicpmv_projector: + projector = torch.load(args.minicpmv_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + ftype_cur = 0 + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + s = "vision_model." 
+ s + if re.match("vision_model.embeddings.position_embedding", s): + v = v.unsqueeze(0) + return {s: v} + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py new file mode 100644 index 0000000..5352662 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py @@ -0,0 +1,47 @@ +import argparse +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to MiniCPM-V model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True: + projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb +torch.save(projector, f"{args.model}/minicpmv.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/minicpmv.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/model") +tok = AutoTokenizer.from_pretrained(args.model, 
trust_remote_code=True) +tok.save_pretrained(f"{args.model}/model") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/models/cogvlm.cpp b/llama.cpp/tools/mtmd/models/cogvlm.cpp new file mode 100644 index 0000000..d5b739c --- /dev/null +++ b/llama.cpp/tools/mtmd/models/cogvlm.cpp @@ -0,0 +1,98 @@ +#include "models.h" + +ggml_cgraph * clip_graph_cogvlm::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } + + // remove CLS token (like build_llama4 does) + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, + n_embd, n_patches, + ggml_row_size(inpL->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + + // Concatenate with boi and eoi + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/conformer.cpp b/llama.cpp/tools/mtmd/models/conformer.cpp new file mode 100644 index 0000000..9b1fab4 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/conformer.cpp @@ -0,0 +1,216 @@ +#include "models.h" + +ggml_cgraph * 
clip_graph_conformer::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd); + ggml_set_name(pos_emb, "pos_emb"); + ggml_set_input(pos_emb); + ggml_build_forward_expand(gf, pos_emb); + + ggml_tensor * inp = build_inp_raw(1); + + auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + // pre encode, conv subsampling + { + // layer.0 - conv2d + cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]); + cb(cur, "conformer.pre_encode.conv.{}", 0); + + // layer.1 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.2 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]); + cb(cur, "conformer.pre_encode.conv.{}", 2); + + // layer.3 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]); + cb(cur, "conformer.pre_encode.conv.{}", 3); + + // layer.4 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.5 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]); + cb(cur, "conformer.pre_encode.conv.{}", 5); + + // layer.6 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]); + cb(cur, "conformer.pre_encode.conv.{}", 6); + + // layer.7 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // flatten channel and frequency axis + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + + // calculate out + cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur); + cur = ggml_add(ctx0, cur, model.pre_encode_out_b); + cb(cur, "conformer.pre_encode.out", -1); + } + + // pos_emb + cb(pos_emb, "pos_emb", -1); + + for (int il = 0; il < hparams.n_layer; il++) { + const auto & layer = model.layers[il]; + + auto * residual = cur; + + cb(cur, "layer.in", il); + + // feed_forward1 + cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_feed_forward1", il); + + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU, + il); + cb(cur, "conformer.layers.{}.feed_forward1.linear2", il); + + const auto fc_factor = 0.5f; + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + + // self-attention + { + cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_self_att", il); + + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]); + ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u); + Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3); + ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v); + Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3); + + // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, 
cur); + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]); + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]); + Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3)); + + // build_attn won't fit due to matrix_ac and matrix_bd separation + ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur); + matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3)); + cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il); + + auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb); + cb(p, "conformer.layers.{}.self_attn.linear_pos", il); + p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]); + p = ggml_permute(ctx0, p, 0, 2, 1, 3); + + auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p); + matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3)); + + // rel shift + { + const auto pos_len = matrix_bd->ne[0]; + const auto q_len = matrix_bd->ne[1]; + const auto h = matrix_bd->ne[2]; + matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0); + matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0); + matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h); + matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1], + matrix_bd->nb[2], matrix_bd->nb[0] * q_len); + matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h); + } + + matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1], + matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0); + auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd); + scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head)); + cb(scores, "conformer.layers.{}.self_attn.id0", il); + + ggml_tensor * attn = ggml_soft_max(ctx0, scores); + ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur); + x = ggml_permute(ctx0, x, 2, 0, 1, 3); + x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]); + + ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x); + out = ggml_add(ctx0, out, layer.o_b); + cb(out, "conformer.layers.{}.self_attn.linear_out", il); + + cur = out; + } + + residual = ggml_add(ctx0, residual, cur); + cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_conv", il); + + // conv + { + auto * x = cur; + x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x); + x = ggml_add(ctx0, x, layer.conv_pw1_b); + cb(x, "conformer.layers.{}.conv.pointwise_conv1", il); + + // ggml_glu doesn't support sigmoid + // TODO @ngxson : support this ops in ggml + { + int64_t d = x->ne[0] / 2; + ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])); + x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate); + x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); + } + + // use ggml_ssm_conv for f32 precision + x = ggml_pad(ctx0, x, 4, 0, 0, 0); + x = ggml_roll(ctx0, x, 4, 0, 0, 0); + x = ggml_pad(ctx0, x, 4, 0, 0, 0); + x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w); + x = ggml_add(ctx0, x, layer.conv_dw_b); + + x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b); + x = ggml_silu(ctx0, x); + + // pointwise_conv2 + x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x); + x = ggml_add(ctx0, x, layer.conv_pw2_b); + + cur = x; + } + + residual = ggml_add(ctx0, residual, cur); + + cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, 
NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_feed_forward2", il); + + cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b, + FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams + cb(cur, "conformer.layers.{}.feed_forward2.linear2", il); + + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + cb(residual, "conformer.layers.{}.conv.id", il); + + cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_out", il); + } + + // audio adapter + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cb(cur, "audio_adapter.model.{}", 0); + cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1); + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/glm4v.cpp b/llama.cpp/tools/mtmd/models/glm4v.cpp new file mode 100644 index 0000000..f39b692 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/glm4v.cpp @@ -0,0 +1,120 @@ +#include "models.h" + +ggml_cgraph * clip_graph_glm4v::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + + norm_type norm_t = NORM_TYPE_RMS; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + + // pos-conv norm + inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + cb(learned_pos_embd, "learned_pos_embd", -1); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return ggml_rope_multi( + ctx0, cur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, + 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + }; + + 
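+    // (editor note, illustrative) positions carries four position coordinates per patch
+    // (hence the n_patches * 4 tensor above); mrope_sections splits the rotary dimensions
+    // evenly across them, d_head/4 each -- e.g. if d_head were 64, each coordinate stream
+    // would rotate 16 dims. See ggml_rope_multi / GGML_ROPE_TYPE_VISION for the exact layout.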
ggml_tensor * cur = build_vit( + inp, n_patches, + norm_t, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1); + + // GLM4V projector + // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 + + // patch merger (downsample) + { + int n_merge = hparams.n_merge; + GGML_ASSERT(n_merge > 0); + + int n_token_out = n_patches / n_merge / n_merge; + cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out] + cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out] + + cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); + } + + // FC projector + { + cur = ggml_mul_mat(ctx0, model.projection, cur); + // default LayerNorm (post_projection_norm) + cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "after_fc_proj", -1); + } + + // FFN projector + { + cur = build_ffn(cur, + model.mm_ffn_up_w, model.mm_ffn_up_b, + model.mm_ffn_gate_w, model.mm_ffn_gate_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, + hparams.ffn_op, -1); + cb(cur, "after_ffn_proj", -1); + // cb(ggml_sum(ctx0, cur), "merged_sum", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/internvl.cpp b/llama.cpp/tools/mtmd/models/internvl.cpp new file mode 100644 index 0000000..9aded3b --- /dev/null +++ b/llama.cpp/tools/mtmd/models/internvl.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_internvl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_3_w, model.mm_3_b, + FFN_GELU, + -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/kimik25.cpp b/llama.cpp/tools/mtmd/models/kimik25.cpp new file mode 100644 index 0000000..cf9f27f --- /dev/null +++ b/llama.cpp/tools/mtmd/models/kimik25.cpp @@ -0,0 +1,101 @@ +#include "models.h" +#include +#include + +// note: this is similar to clip_graph::resize_position_embeddings, major difference is having +// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead +// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3). 
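+//
+// Shape sketch (editor annotation, illustrative only -- it just restates the code below): the
+// stored pos_embd is kept as [C, W, H] (per the comments below, C = 1152 with a 64x64 grid).
+// When the actual patch grid (img.nx/patch_size x img.ny/patch_size) differs, the spatial axes
+// are brought forward, the grid is interpolated to the new (width, height) -- bicubic when
+// called from build() -- and the result is flattened back to [C, width*height] so it can be
+// added per patch.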
+ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
+    ggml_tensor * pos_embd = model.position_embeddings;
+    const int height = img.ny / patch_size;
+    const int width = img.nx / patch_size;
+    const uint32_t mode = interpolation_mode;
+
+    GGML_ASSERT(pos_embd);
+
+    const int64_t stored_c = pos_embd->ne[0]; // C = 1152
+    const int64_t orig_w = pos_embd->ne[1]; // W = 64
+    const int64_t orig_h = pos_embd->ne[2]; // H = 64
+
+    GGML_ASSERT(stored_c == n_embd);
+
+    if (height == (int)orig_h && width == (int)orig_w) {
+        // No interpolation needed, just flatten to [C, H*W]
+        return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    }
+
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    return pos_embd;
+}
+
+ggml_cgraph * clip_graph_kimik25::build() {
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
+
+    // Kimi-K2.5 natively uses an interleaved 2D RoPE pattern, but
+    // Q / K are permuted during conversion to use the split format.
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        return cur;
+    };
+
+    ggml_tensor * inp = build_inp();
+
+    // For reasons not yet understood, passing learned_pos_embd through build_vit
+    // led to the ggml_add not occurring; adding it manually here works.
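+    // (editor note) inp is [n_embd, n_patches] and learned_pos_embd was flattened to
+    // [n_embd, width*height] == [n_embd, n_patches] above, so this is a plain element-wise
+    // add -- the same addition build_vit would perform if given a non-null learned_pos_embd.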
+ inp = ggml_add(ctx0, inp, learned_pos_embd); + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + nullptr, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + int n_merged_patches = cur->ne[1]; + cur = ggml_view_2d(ctx0, cur, + n_embd, n_merged_patches * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, n_merged_patches, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/kimivl.cpp b/llama.cpp/tools/mtmd/models/kimivl.cpp new file mode 100644 index 0000000..0a06f50 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/kimivl.cpp @@ -0,0 +1,63 @@ +#include "models.h" + +ggml_cgraph * clip_graph_kimivl::build() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/llama4.cpp b/llama.cpp/tools/mtmd/models/llama4.cpp new file mode 100644 index 0000000..30d1df5 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/llama4.cpp @@ -0,0 +1,96 @@ +#include "models.h" + +ggml_cgraph * clip_graph_llama4::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // 2D input positions + ggml_tensor * pos_h = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp_raw(); + + // Llama4UnfoldConvolution + { + ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, + patch_size, patch_size, 3, n_embd); + inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + cb(inp, "patch_conv", -1); + } + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 + // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + add_pos); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + // based on Llama4VisionPixelShuffleMLP + // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + GGML_ASSERT(scale_factor > 0); + GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images + cur = ggml_reshape_4d(ctx0, cur, + n_embd * scale_factor, + n_patches_x / scale_factor, + n_patches_y, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches_x / scale_factor, + n_patches_y / scale_factor, + bsz); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches / scale_factor / scale_factor); + cb(cur, "pixel_shuffle", -1); + } + + // based on Llama4VisionMLP2 (always uses GELU activation, no bias) + { + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = ggml_gelu(ctx0, cur); + cb(cur, "adapter_mlp", -1); + } + + // Llama4MultiModalProjector + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projected", -1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/llava.cpp b/llama.cpp/tools/mtmd/models/llava.cpp new file mode 100644 index 0000000..0bfb5f0 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/llava.cpp @@ -0,0 +1,374 @@ +#include "models.h" + +// this graph is used by llava, granite and glm +// due to having embedding_stack (used by granite), we cannot reuse build_vit +ggml_cgraph * clip_graph_llava::build() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 
1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. 
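+        // (editor note, hypothetical example) with vision_feature_layer = {3, 7, 15, 26},
+        // the tensors pushed here are the inputs of layers 3, 7, 15 and 26 (index 0 being the
+        // encoder input); they are concatenated along the embedding dim after the loop.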
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // 
GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], 
ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = 
ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/minicpmv.cpp b/llama.cpp/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 0000000..3594ea2 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + 
NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/mobilenetv5.cpp b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp new file mode 100644 index 0000000..593afa1 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp @@ -0,0 +1,451 @@ +#include "models.h" + +// Helpers for MobileNetV5 Blocks +// RMS Norm 2D - normalizes over channels for each spatial position +ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { + // inp: [W, H, C, B] + + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_rms_norm(ctx0, cur, eps); + + if (weight) { + cur = ggml_mul(ctx0, cur, weight); + } + + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + return cur; +} + +// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF +ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = 
inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) + pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Universal Inverted Residual Block (Stage 1+) +ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. 
Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Attention Block (MQA) +ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { + ggml_tensor * cur = inp; + + // Norm + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); + } + + // 1. Q Calculation + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // 2. K Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // 3. 
V Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // Multi-Query Attention + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); + + scores = ggml_scale(ctx0, scores, scale); + + scores = ggml_soft_max(ctx0, scores); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); + kqv = ggml_cont(ctx0, kqv); + + + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + + // Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // Residual & Layer Scale + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); + } + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_cgraph * clip_graph_mobilenetv5::build() { + ggml_tensor * inp = build_inp_raw(); + + // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) + ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding + + cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 + if (model.mobilenet_stem_conv_b) { + cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b); + } + if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); + cur = ggml_gelu(ctx0, cur); + + + // 2. 
Blocks + std::vector intermediate_features; + const int total_blocks = model.mobilenet_blocks.size(); + + auto is_stage_start = [&](int i) { + if (i == 0) return true; + for (int end_idx : model.mobilenet_stage_ends) { + if (i == end_idx + 1) return true; + } + return false; + }; + + auto is_fusion_point = [&](int i) { + if (model.mobilenet_stage_ends.size() >= 4) { + if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2 + if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3 + } else { + if (i == total_blocks - 1) return true; + } + return false; + }; + + for (int i = 0; i < total_blocks; i++) { + const auto & block = model.mobilenet_blocks[i]; + int stride = is_stage_start(i) ? 2 : 1; + + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block); + else cur = build_inverted_residual(cur, block, stride); + + if (is_fusion_point(i)) { + + intermediate_features.push_back(cur); + } + } + + // 3. Multi-Scale Fusion Adapter (MSFA) + if (!intermediate_features.empty()) { + + // A. Reference Resolution: PyTorch implementation uses inputs[0] + // We assume intermediate_features[0] is the "High Resolution" target. + // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). + ggml_tensor* target_feat = intermediate_features[0]; + int high_res_w = target_feat->ne[0]; + int high_res_h = target_feat->ne[1]; + + std::vector resized_feats; + + // B. Resize inputs to match inputs[0] (High Resolution) + for (auto feat : intermediate_features) { + int feat_w = feat->ne[0]; + int feat_h = feat->ne[1]; + + // PyTorch: if feat_size < high_resolution: interpolate + if (feat_w < high_res_w || feat_h < high_res_h) { + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. + // ggml_upscale generally takes integer factors or target sizes depending on helper. + // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). + int scale_w = high_res_w / feat_w; + // int scale_h = high_res_h / feat_h; + + // Safety check for non-integer scaling if strictly replicating + GGML_ASSERT(high_res_w % feat_w == 0); + + // Upsample (Nearest Neighbor) + // 2 is the scale factor + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + } + resized_feats.push_back(feat); + } + + // C. Concatenate at High Resolution (Channel Dim = 2 in ggml) + cur = resized_feats[0]; + for (size_t k = 1; k < resized_feats.size(); ++k) { + cur = ggml_concat(ctx0, cur, resized_feats[k], 2); + } + + // D. FFN (UniversalInvertedResidual) + // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm + + // 1. Expansion + if (model.msfa_ffn_expand_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); + + if (model.msfa_ffn_expand_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); + } + + cur = ggml_gelu(ctx0, cur); + + } + + // 2. Projection (No DW because kernel_size=0) + if (model.msfa_ffn_project_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); + + // UniversalInvertedResidual typically has a norm after projection + if (model.msfa_ffn_project_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); + } + + } + + // E. Final Downsample to Target Resolution (Output Resolution) + // PyTorch: matches self.output_resolution (e.g. 
16x16) + const int target_out_res = 16; + int current_w = cur->ne[0]; + + if (current_w > target_out_res) { + int s = current_w / target_out_res; + + GGML_ASSERT(current_w % target_out_res == 0); + + // Avg Pool: Kernel=s, Stride=s + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); + + } + + // F. Final Norm + if (model.msfa_concat_norm_w) { + cur = rms_norm_2d(cur, model.msfa_concat_norm_w); + + } + } + + // 4. Gemma 3n Multimodal Projection (Embedder) + // Input: 'cur' is [Width, Height, Channels, Batch] + int W = cur->ne[0]; + int H = cur->ne[1]; + int C = cur->ne[2]; + int B = cur->ne[3]; + + GGML_ASSERT(C == hparams.n_embd); + + // 1. Permute and Flatten to [Channels, Tokens, Batch] + // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B] + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); + cur = ggml_cont(ctx0, cur); + + + // 2. FEATURE SCALING + // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 + const float scale_factor = sqrtf((float)C); + cur = ggml_scale(ctx0, cur, scale_factor); + + + // 3. SOFT EMBEDDING NORM + // PyTorch: self._norm(x) * self.weight + // We must normalize regardless, then multiply if weight exists. + { + const float eps = 1e-6f; // Gemma3n uses 1e-6 + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_soft_emb_norm_w) { + // Weight shape is (2048,) -> Element-wise broadcast multiply + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + } + + } + + // 4. PROJECTION + // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) + // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] + if (model.mm_input_proj_w) { + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + } + + // 5. POST PROJECTION NORM + // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) + // with_scale=False means weight is registered as buffer with value 1.0 + // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 + { + const float eps = 1e-6f; + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_post_proj_norm_w) { + // If weight is loaded, multiply (should be ~1.0 anyway) + cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w); + } + } + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/models.h b/llama.cpp/tools/mtmd/models/models.h new file mode 100644 index 0000000..c4c67ac --- /dev/null +++ b/llama.cpp/tools/mtmd/models/models.h @@ -0,0 +1,118 @@ +#pragma once + +#include "../clip-graph.h" + +/* + * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated. + * We encourage human contributors to ensure the quality and reliability of the codebase. 
+ */ + +struct clip_graph_siglip : clip_graph { + clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_pixtral : clip_graph { + clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen2vl : clip_graph { + clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen3vl : clip_graph { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_youtuvl : clip_graph { + clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_minicpmv : clip_graph { + clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_internvl : clip_graph { + clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llama4 : clip_graph { + clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_kimivl : clip_graph { + clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_cogvlm : clip_graph { + clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llava : clip_graph { + clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_whisper_enc : clip_graph { + clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_conformer : clip_graph { + clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_glm4v : clip_graph { + clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_mobilenetv5 : clip_graph { + clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_inverted_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block); +}; + +struct clip_graph_kimik25 : clip_graph { + clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode); +}; diff --git a/llama.cpp/tools/mtmd/models/pixtral.cpp b/llama.cpp/tools/mtmd/models/pixtral.cpp new file mode 100644 
index 0000000..a849210 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/pixtral.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_pixtral::build() { + const int n_merge = hparams.n_merge; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.n_merge > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } + + // arrangement of the [IMG_BREAK] token + if (model.token_embd_img_break) { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? 
n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/qwen2vl.cpp b/llama.cpp/tools/mtmd/models/qwen2vl.cpp new file mode 100644 index 0000000..85f158b --- /dev/null +++ b/llama.cpp/tools/mtmd/models/qwen2vl.cpp @@ -0,0 +1,183 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen2vl::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL + ? NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * 
n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/qwen3vl.cpp b/llama.cpp/tools/mtmd/models/qwen3vl.cpp new file mode 100644 index 0000000..5ecb10f --- /dev/null +++ b/llama.cpp/tools/mtmd/models/qwen3vl.cpp @@ -0,0 +1,193 @@ +#include "models.h" + +ggml_cgraph * 
clip_graph_qwen3vl::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * inpL = inp; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] + ggml_tensor * deepstack_features = nullptr; + const int merge_factor = hparams.n_merge > 0 ? 
hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + if (layer.has_deepstack()) { + ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); + feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); + feat = build_ffn(feat, + layer.deepstack_fc1_w, layer.deepstack_fc1_b, + nullptr, nullptr, + layer.deepstack_fc2_w, layer.deepstack_fc2_b, + ffn_op_type::FFN_GELU, il); + + if(!deepstack_features) { + deepstack_features = feat; + } else { + // concat along the feature dimension + deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + } + } + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + if (deepstack_features) { + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); + } // concat along the feature dimension + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/siglip.cpp b/llama.cpp/tools/mtmd/models/siglip.cpp new file mode 100644 index 0000000..b866a11 --- /dev/null +++ 
b/llama.cpp/tools/mtmd/models/siglip.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_siglip::build() { + ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (proj_type == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.n_merge; + + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + cur = ggml_mul_mat(ctx0, model.projection, cur); + + } else if (proj_type == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection, in LFM2-VL input norm is optional + if (model.mm_input_norm_w) { + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + } + + if (model.mm_input_norm_b) { + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + } + + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + hparams.ffn_op, + -1); + + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/whisper-enc.cpp b/llama.cpp/tools/mtmd/models/whisper-enc.cpp new file mode 100644 index 0000000..2f2b127 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/whisper-enc.cpp @@ -0,0 +1,115 @@ +#include "models.h" + +ggml_cgraph * clip_graph_whisper_enc::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", 
-1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (model.audio_has_stack_frames()) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cb(cur, "after_stacked", -1); + } + + if (proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // UltravoxProjector + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + cur = ggml_swiglu_swapped(ctx0, cur); + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else if (proj_type == PROJECTOR_TYPE_GLMA) { + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + cur = ggml_add(ctx0, cur, model.mm_norm_pre_b); + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0); + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/youtuvl.cpp b/llama.cpp/tools/mtmd/models/youtuvl.cpp new file mode 100644 index 0000000..ffbf2be --- /dev/null +++ b/llama.cpp/tools/mtmd/models/youtuvl.cpp @@ -0,0 +1,179 @@ +#include "models.h" + +ggml_cgraph * clip_graph_youtuvl::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int batch_size = 1; + const bool use_window_attn = !hparams.wa_layer_indexes.empty(); + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; + const int m = 2; + const int Wp = n_patches_x; + const int Hp = n_patches_y; + const int Hm = Hp / m; + const int Wm = Wp / m; + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp = build_inp_raw(); + + // change conv3d to linear + // reshape and permute to get patches, permute from (patch_size, m, Wm, 
patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm) + { + inp = ggml_reshape_4d( + ctx0, inp, + Wm * m * patch_size, m * patch_size, Hm, 3); + inp = ggml_permute(ctx0, inp, 1, 2, 3, 0); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, Wm, m * patch_size, Hm); + + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, patch_size, m, Hm * Wm); + + inp = ggml_permute(ctx0, inp, 1, 0, 2, 3); + inp = ggml_cont_4d( + ctx0, inp, + patch_size, 3, patch_size, Hm * Wm * m * m); + + inp = ggml_permute(ctx0, inp, 2, 0, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + 3*patch_size* patch_size, Hm * Wm * m * m, 1); + } + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + } + + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + if (use_window_attn) { + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + ggml_tensor * attn_mask = full_attn ? 
nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + } + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + nullptr, nullptr, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + ggml_tensor * embeddings = inpL; + if (use_window_attn) { + const int spatial_merge_unit = 4; + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size); + cb(embeddings, "window_order_restored", -1); + } + + // post-layernorm (part of Siglip2VisionTransformer, applied after encoder) + if (model.post_ln_w) { + embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // Now apply merger (VLPatchMerger): + // 1. Apply RMS norm (ln_q in VLPatchMerger) + embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1); + cb(embeddings, "merger_normed", -1); + + // 2. First reshape for spatial merge (merge 2x2 patches) + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + cb(embeddings, "merger_reshaped", -1); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.cpp b/llama.cpp/tools/mtmd/mtmd-audio.cpp new file mode 100644 index 0000000..e8eef03 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.cpp @@ -0,0 +1,730 @@ +#include "mtmd-audio.h" + +#define _USE_MATH_DEFINES // for M_PI +#include +#include +#include +#include +#include +#include +#include + +// some of the code here is copied from whisper.cpp + +constexpr bool DEBUG = false; + +void mtmd_audio_cache::fill_sin_cos_table(int n) { + sin_vals.resize(n); + cos_vals.resize(n); + for (int i = 0; i < n; i++) { + double theta = (2 * M_PI * i) / n; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } +} + +void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { + hann_window.resize(length); + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, + float fmin, + float fmax, + bool slaney_area_norm, + float scale) { + GGML_ASSERT(n_mel > 0 && n_fft > 1); + if (fmax <= 0.0f) { + fmax = 0.5f * sample_rate; + } + + // Slaney scale (matches librosa default) + const double min_log_hz = 1000.0; + const double lin_slope = 3 / 200.; + const double min_log_mel = min_log_hz * lin_slope; + const double log_step = log(6.4) / 27.0; + auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { + return (f_hz < min_log_hz) ? 
f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; + }; + auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { + return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); + }; + + // infer N_fft from n_fft_bins + const double bin_hz_step = double(sample_rate) / double(n_fft); + + // mel grid: n_mel + 2 edges + const double m_lo = hz_to_mel(fmin); + const double m_hi = hz_to_mel(fmax); + std::vector mel_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); + } + + // convert to Hz + std::vector hz_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + hz_pts[i] = mel_to_hz(mel_pts[i]); + } + + const int n_fft_bins = n_fft / 2 + 1; + + // filterbank + std::vector out(n_mel * n_fft_bins, 0); + for (int m = 0; m < n_mel; ++m) { + const double f_left = hz_pts[m]; + const double f_center = hz_pts[m + 1]; + const double f_right = hz_pts[m + 2]; + + const double denom_l = std::max(1e-30, f_center - f_left); + const double denom_r = std::max(1e-30, f_right - f_center); + const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0; + + for (int k = 0; k < n_fft_bins; ++k) { + const double f = k * bin_hz_step; + double w = 0.0; + if (f >= f_left && f <= f_center) { + w = (f - f_left) / denom_l; + } else if (f > f_center && f <= f_right) { + w = (f_right - f) / denom_r; + } + out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale); + } + } + + filters.n_mel = n_mel; + filters.n_fft = n_fft; + filters.data = std::move(out); + + if (DEBUG) { // debug + for (size_t i = 0; i < filters.data.size(); ++i) { + if (filters.data[i] != 0.0f) { + printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f); + } + } + } +} + +// Unified DFT implementation for both forward and inverse transforms +// Template parameters: +// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling +// true = IDFT with exp(+2πi·k·n/N), scales by 1/N +// RealInput: true = input is real-valued (stride 1), avoids imaginary computations +// false = input is complex-valued (interleaved real/imag, stride 2) +template +static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + const float scale = Inverse ? 
(1.0f / N) : 1.0f; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % n_sin_cos_vals; + float cos_val = cache.cos_vals[idx]; + float sin_val = cache.sin_vals[idx]; + + if constexpr (RealInput) { + // Real input: in_im = 0, simplifies to: + // re += in_re * cos_val + // im += sign * in_re * sin_val + float in_re = in[n]; + re += in_re * cos_val; + im += sign * in_re * sin_val; + } else { + float in_re = in[n * 2 + 0]; + float in_im = in[n * 2 + 1]; + // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i + re += in_re * cos_val - sign * in_im * sin_val; + im += sign * in_re * sin_val + in_im * cos_val; + } + } + + out[k * 2 + 0] = re * scale; + out[k * 2 + 1] = im * scale; + } +} + +// Cooley-Tukey FFT/IFFT unified implementation +// Template parameters: +// Inverse: false = FFT with exp(-2πi·k/N), no scaling +// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level +// RealInput: true = input is real-valued (stride 1) +// false = input is complex-valued (interleaved real/imag, stride 2) +template +static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + + if (N == 1) { + out[0] = in[0]; + if constexpr (RealInput) { + out[1] = 0.0f; + } else { + out[1] = in[1]; + } + return; + } + + const int half_N = N / 2; + if (N - half_N * 2 == 1) { + // Odd N: fall back to DFT + dft_impl(cache, in, N, out); + return; + } + + // Split into even and odd + if constexpr (RealInput) { + // Real input: stride is 1, copy only real values + float * even = in + N; + for (int i = 0; i < half_N; ++i) { + even[i] = in[2 * i]; + } + float * even_fft = out + 2 * N; + fft_impl(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i] = in[2 * i + 1]; + } + float * odd_fft = even_fft + N; + fft_impl(cache, odd, half_N, odd_fft); + } else { + // Complex input: stride is 2, copy complex pairs + float * even = in + N * 2; + for (int i = 0; i < half_N; ++i) { + even[i * 2 + 0] = in[2 * i * 2 + 0]; + even[i * 2 + 1] = in[2 * i * 2 + 1]; + } + float * even_fft = out + 2 * N; + fft_impl(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0]; + odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1]; + } + float * odd_fft = even_fft + N; + fft_impl(cache, odd, half_N, odd_fft); + } + + float * even_fft = out + 2 * N; + float * odd_fft = even_fft + N; + + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + constexpr float scale = Inverse ? 
0.5f : 1.0f; + + for (int k = 0; k < half_N; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = cache.cos_vals[idx]; + float im = sign * cache.sin_vals[idx]; + + float re_odd = odd_fft[2 * k + 0]; + float im_odd = odd_fft[2 * k + 1]; + + out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd); + out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd); + + out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd); + out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd); + } +} + +// Forward FFT for real input (used by mel spectrogram) +static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl(cache, in, N, out); +} + +// Inverse FFT for complex input +static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl(cache, in, N, out); +} + +struct filter_params { + int32_t n_mel; + int32_t n_fft_bins; + int32_t hann_window_size; + int32_t hop_length; + int32_t sample_rate; + bool center_padding = false; + float preemph = 0.f; + bool use_natural_log = false; + bool norm_per_feature = false; +}; + +static void log_mel_spectrogram_worker_thread(int ith, + const float * hann, + const std::vector & samples, + int n_samples, + int frame_size, + int frame_step, + int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + std::vector fft_in(frame_size * 2, 0.0); + std::vector fft_out(frame_size * 2 * 2 * 2); + + int n_fft_bins = params.n_fft_bins; + int i = ith; + + const auto & filters = cache.filters; + + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); + GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hann window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(cache, fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < n_fft_bins; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < out.n_mel; j++) { + double sum = 0.0; + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft_bins - 3; k += 4) { + size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k); + sum += + fft_out[k + 0] * filters.data[idx + 0] + + fft_out[k + 1] * filters.data[idx + 1] + + fft_out[k + 2] * filters.data[idx + 2] + + fft_out[k + 3] * filters.data[idx + 3]; + } + // handle n_fft remainder + for (; k < n_fft_bins; k++) { + sum += fft_out[k] * filters.data[j * n_fft_bins + k]; + } + sum = params.use_natural_log + ? log(sum + 5.960464477539063e-08) + : log10(std::max(sum, 1e-10)); + out.data[j * out.n_len + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = params.use_natural_log ? 
log(1e-10) : log10(1e-10); + for (; i < out.n_len; i += n_threads) { + for (int j = 0; j < out.n_mel; j++) { + out.data[j * out.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples_in, + const int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + //const int64_t t_start_us = ggml_time_us(); + + out.n_len_org = n_samples_in; + int n_samples = n_samples_in; + + // Hann window + const float * hann = cache.hann_window.data(); + const int frame_size = (params.n_fft_bins - 1) * 2; + const int frame_step = params.hop_length; + + // Padding + std::vector samples_padded; + if (params.center_padding) { + const auto pad_amount = frame_size / 2; + samples_padded = std::vector(n_samples + 2 * pad_amount, 0); + std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount); + samples = samples_padded.data(); + n_samples = samples_padded.size(); + } else { + // existing padding logic + int64_t stage_1_pad = params.sample_rate * 30; + int64_t stage_2_pad = frame_size / 2; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + // reflective pad 200 samples at the beginning of audio + if (n_samples < stage_2_pad + 1) { + // TODO: Handle short audio differently or return error + return false; + } + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + } + + // preemphasis + if (params.preemph) { + const int pad_amount = frame_size / 2; + const float preemph = 0.97f; + float prev = samples_padded[pad_amount]; + for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) { + float cur = samples_padded[i]; + samples_padded[i] = cur - preemph * prev; + prev = cur; + } + } + + // pad hann window if it's smaller than frame_size + // TODO: probably unnecessary here? (or better doing it in g_cache?) 
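// The padding below just centres the configured analysis window inside a
// frame_size buffer (frame_size = (n_fft_bins - 1) * 2). A minimal,
// self-contained sketch of that centring, assuming the window is never longer
// than the frame (helper name is illustrative only; requires <vector> and
// <algorithm>):
static std::vector<float> center_pad_window(const std::vector<float> & win, int frame_size) {
    std::vector<float> padded(frame_size, 0.0f);
    const int offset = (frame_size - (int) win.size()) / 2; // zeros split evenly on both sides
    std::copy(win.begin(), win.end(), padded.begin() + offset);
    return padded;
}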
+ std::vector hann_window_padded; + if (params.hann_window_size < frame_size) { + hann_window_padded.resize(frame_size); + const int padding = (frame_size - params.hann_window_size) / 2; + std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]); + hann = hann_window_padded.data(); + } + + + out.n_mel = params.n_mel; + out.n_len = (n_samples - frame_size) / frame_step + 1; + // TODO: handle these checks better + if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) { + LOG_ERR("%s: size overflow\n", __func__); + return false; + } + if (n_samples < frame_size) { + LOG_ERR("%s: not enough samples after padding\n", __func__); + return false; + } + out.data.resize(out.n_mel * out.n_len); + + { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = + std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples, + frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, + cache, out); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + const int effective_n_len = n_samples_in / frame_step; + if (params.norm_per_feature) { + for (int i = 0; i < out.n_mel; i++) { + double mean = 0; + for (int j = 0; j < effective_n_len; ++j) { + mean += out.data[i * out.n_len + j]; + } + mean /= effective_n_len; + + double var = 0.0; + for (int j = 0; j < effective_n_len; ++j) { + const double value = out.data[i * out.n_len + j] - mean; + var += value * value; + } + var /= effective_n_len - 1; // unbiased + const double mstd = std::sqrt(var + 1e-5); + + for (int j = 0; j < effective_n_len; ++j) { + auto &value = out.data[i * out.n_len + j]; + value = (value - mean) / mstd; + } + + // pad the rest with zeros + for (int j = effective_n_len; j < out.n_len; ++j) { + out.data[i * out.n_len + j] = 0.0; + } + } + } else { + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] > mmax) { + mmax = out.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] < mmax) { + out.data[i] = mmax; + } + out.data[i] = (out.data[i] + 4.0)/4.0; + } + } + + // Dump log_mel_spectrogram + if (DEBUG) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < out.data.size() - 1; i++) { + outFile << out.data[i] << ", "; + } + outFile << out.data[out.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +// +// mtmd_audio_preprocessor_whisper +// + +void mtmd_audio_preprocessor_whisper::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + // empty audio + return false; + } + + std::vector smpl; + // if input is too short, pad with zeros + // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram + // TODO: maybe handle this better + size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin + if (n_samples < min_samples) { + smpl.resize(min_samples, 0.0f); + 
std::memcpy(smpl.data(), samples, n_samples * sizeof(float)); + samples = smpl.data(); + n_samples = smpl.size(); + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = false; + params.preemph = 0.0f; // disabled + params.use_natural_log = false; + params.norm_per_feature = false; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel + // we always expect the mel to have 3000 silent frames at the end + if (DEBUG) { + printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); + } + const size_t frames_per_chunk = 3000; + GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); + for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { + int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); + if ((size_t) n_len < frames_per_chunk) { + break; // last uncomplete chunk will always be a padded chunk, safe to ignore + } + + mtmd_audio_mel out_chunk; + out_chunk.n_len = n_len; + out_chunk.n_mel = out_full.n_mel; + out_chunk.n_len_org = out_full.n_mel; // unused + out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + + for (int i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + i * out_full.n_len + off; + out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); + } + + output.push_back(std::move(out_chunk)); + } + + return true; +} + +// +// mtmd_audio_preprocessor_conformer +// + +void mtmd_audio_preprocessor_conformer::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + // empty audio + if (n_samples == 0) { + return false; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = true; + params.preemph = 0.97f; + params.use_natural_log = true; + params.norm_per_feature = true; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + output.push_back(std::move(out_full)); + return true; +} + +// +// mtmd_audio_streaming_istft implementation +// + +mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) : + n_fft(n_fft), + hop_length(hop_length), + n_fft_bins(n_fft / 2 + 1), + overlap_buffer(n_fft, 0.0f), + window_sum_buffer(n_fft, 0.0f), + padding_to_remove((n_fft - hop_length) / 
2), + ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT + ifft_out(n_fft * 2 * 4, 0.0f) { + cache.fill_sin_cos_table(n_fft); + cache.fill_hann_window(n_fft, true); +} + +void mtmd_audio_streaming_istft::reset() { + std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f); + std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f); + padding_to_remove = (n_fft - hop_length) / 2; +} + +std::vector mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) { + std::vector output(hop_length); + + // copy frequencies + for (int j = 0; j < n_fft_bins; j++) { + ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0]; + ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1]; + } + + // mirror negative frequencies + for (int j = 1; j < n_fft_bins - 1; j++) { + int mirror_idx = n_fft - j; + ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0]; + ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate + } + + ifft(cache, ifft_in.data(), n_fft, ifft_out.data()); + + // update window sum and overlap buffer + for (int j = 0; j < n_fft; j++) { + window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j]; + overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j]; + } + + // extract hop_length samples with normalization + for (int i = 0; i < hop_length; i++) { + if (window_sum_buffer[i] > 1e-8f) { + output[i] = overlap_buffer[i] / window_sum_buffer[i]; + } else { + output[i] = overlap_buffer[i]; + } + } + + // shift buffers left by hop_length + std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f); + + std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f); + + // Remove padding if needed + int to_remove = std::min(padding_to_remove, (int) output.size()); + padding_to_remove -= to_remove; + output.erase(output.begin(), output.begin() + to_remove); + + return output; +} + +std::vector mtmd_audio_streaming_istft::flush() { + std::vector output; + + // Extract remaining samples from overlap buffer + // Continue until we've extracted all meaningful samples + int remaining = n_fft - hop_length; + while (remaining > 0) { + int chunk_size = std::min(remaining, hop_length); + + for (int i = 0; i < chunk_size; i++) { + float sample; + if (window_sum_buffer[i] > 1e-8f) { + sample = overlap_buffer[i] / window_sum_buffer[i]; + } else { + sample = overlap_buffer[i]; + } + output.push_back(sample); + } + + // Shift buffers + std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f); + + std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f); + + remaining -= chunk_size; + } + + return output; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h new file mode 100644 index 0000000..016c739 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.h @@ -0,0 +1,113 @@ +#pragma once + +#include "ggml.h" +#include "clip-model.h" + +#include +#include +#include + +#define MTMD_INTERNAL_HEADER + +struct mtmd_audio_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector data; +}; + +struct mtmd_audio_mel_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector data; +}; + 
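// The filterbank produced by fill_mel_filterbank_matrix() is stored flattened,
// row-major as [n_mel x n_fft_bins]. A minimal sketch of how one frame of
// power-spectrum bins is reduced to mel energies with that layout (the same
// indexing the spectrogram worker uses; the function name is illustrative
// only and not part of the API; requires <vector> and <cstddef>):
static std::vector<double> apply_mel_filterbank(const std::vector<float> & filters, // n_mel * n_fft_bins coefficients
                                                const std::vector<float> & power,   // n_fft_bins values, |FFT|^2 per bin
                                                int n_mel,
                                                int n_fft_bins) {
    std::vector<double> mel(n_mel, 0.0);
    for (int m = 0; m < n_mel; ++m) {
        double sum = 0.0;
        for (int k = 0; k < n_fft_bins; ++k) {
            // row m of the filterbank weights the k-th frequency bin
            sum += double(power[k]) * filters[size_t(m) * size_t(n_fft_bins) + size_t(k)];
        }
        mel[m] = sum;
    }
    return mel;
}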
+// cache for audio processing, each processor instance owns its own cache +struct mtmd_audio_cache { + std::vector sin_vals; + std::vector cos_vals; + + std::vector hann_window; + + mtmd_audio_mel_filters filters; + + void fill_sin_cos_table(int n); + + void fill_hann_window(int length, bool periodic); + + // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. + // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. + void fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, // e.g. 16000 + float fmin = 0.0f, // e.g. 0.0 + float fmax = -1.0f, // e.g. sr/2; pass -1 for auto + bool slaney_area_norm = true, + float scale = 1.0f // optional extra scaling + ); +}; + +struct mtmd_audio_preprocessor { + const clip_hparams & hparams; + + mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} + + virtual ~mtmd_audio_preprocessor() = default; + virtual void initialize() = 0; // NOT thread-safe + virtual bool preprocess(const float * samples, size_t n_samples, std::vector & output) = 0; +}; + +struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + +struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + +// +// streaming ISTFT - converts spectrogram frames back to audio one frame at a time +// +struct mtmd_audio_streaming_istft { + mtmd_audio_streaming_istft(int n_fft, int hop_length); + + // reset streaming state + void reset(); + + // process a single STFT frame (streaming) + // frame_spectrum: [n_fft_bins x 2] interleaved real/imag + // returns: up to hop_length samples + std::vector process_frame(const float * frame_spectrum); + + // flush remaining samples at end of stream + std::vector flush(); + + private: + int n_fft; + int hop_length; + int n_fft_bins; + + // Own cache for output processing + mtmd_audio_cache cache; + + // Streaming state + std::vector overlap_buffer; + std::vector window_sum_buffer; + int padding_to_remove; + + // Working buffers for IFFT + std::vector ifft_in; + std::vector ifft_out; +}; diff --git a/llama.cpp/tools/mtmd/mtmd-cli.cpp b/llama.cpp/tools/mtmd/mtmd-cli.cpp new file mode 100644 index 0000000..054c7fa --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-cli.cpp @@ -0,0 +1,437 @@ +#include "arg.h" +#include "debug.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" +#include "chat.h" +#include "mtmd.h" +#include "mtmd-helper.h" + +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +// volatile, because of signal being an interrupt +static volatile bool g_is_generating = false; +static volatile bool g_is_interrupted = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying multimodal support in llama.cpp. 
+ * For contributors: please keep this code simple and easy to understand. + */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for multimodal\n\n" + "Usage: %s [options] -m --mmproj --image --audio